diff --git a/.github/labeler.yml b/.github/labeler.yml index 90cdda4d3ca..8506d38a048 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ cudf.polars: - 'python/cudf_polars/**' pylibcudf: - - 'python/cudf/pylibcudf/**' + - 'python/pylibcudf/**' libcudf: - 'cpp/**' diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 08d08c9c5a0..c034752d373 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ade2f35397b..a65cae34653 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,7 @@ jobs: - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars + - cudf-polars-polars-tests - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -49,6 +50,7 @@ jobs: test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} + test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} steps: - name: Get PR info id: get-pr-info @@ -81,6 +83,7 @@ jobs: - '!java/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' java: - '**' - '!CONTRIBUTING.md' @@ -89,11 +92,13 @@ jobs: - '!img/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' notebooks: - '**' - '!CONTRIBUTING.md' - '!README.md' - '!java/**' + - '!ci/cudf_pandas_scripts/**' python: - '**' - '!CONTRIBUTING.md' @@ -102,6 +107,16 @@ jobs: - '!img/**' - '!java/**' - '!notebooks/**' + - '!ci/cudf_pandas_scripts/**' + cudf_pandas: + - '**' + - 'ci/cudf_pandas_scripts/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -159,7 +174,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -180,7 +195,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -190,7 +205,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks @@ -244,6 +259,17 @@ jobs: # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
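The `matrix_filter` jq expression on the next line implements this selection: keep amd64 entries, group them by CUDA major version, and take the entry with the newest Python (breaking ties by newest CUDA) from each group. For readers unfamiliar with jq, a minimal Python sketch of the same logic, using hypothetical matrix entries:

```python
# Hypothetical CI matrix entries; field names mirror the jq filter below.
from itertools import groupby

matrix = [
    {"ARCH": "amd64", "CUDA_VER": "11.8.0", "PY_VER": "3.10"},
    {"ARCH": "amd64", "CUDA_VER": "12.5.1", "PY_VER": "3.10"},
    {"ARCH": "amd64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
    {"ARCH": "arm64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
]

def ver(s):
    # "12.5.1" -> (12, 5, 1), matching jq's split(".") | map(tonumber)
    return tuple(int(p) for p in s.split("."))

amd64 = sorted((e for e in matrix if e["ARCH"] == "amd64"),
               key=lambda e: ver(e["CUDA_VER"])[0])  # groupby needs sorted input
selected = [max(grp, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"])))
            for _, grp in groupby(amd64, key=lambda e: ver(e["CUDA_VER"])[0])]
# -> one entry per CUDA major: 11.8.0/py3.10 and 12.5.1/py3.11
```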
+ matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit @@ -277,7 +303,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -288,7 +314,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c06fe929988..a22d3c5b9cc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -81,7 +81,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -93,7 +93,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/build.sh b/build.sh index 211e1db9fbf..56359eae235 100755 --- a/build.sh +++ b/build.sh @@ -17,13 +17,14 @@ ARGS=$* # script, and that this script resides in the repo dir! 
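# (dirname "$0" yields the directory containing this script; the cd + pwd round trip makes it an absolute path)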
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" -HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] +VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" +HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only pylibcudf - build the pylibcudf Python package cudf - build the cudf Python package + cudf_polars - build the cudf_polars Python package cudfjar - build cudf JAR with static libcudf using devtoolset toolchain dask_cudf - build the dask_cudf Python package benchmarks - build benchmarks @@ -239,11 +240,6 @@ if hasArg --pydevelop; then PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" fi -# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. -if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" -fi - if hasArg --disable_large_strings; then BUILD_DISABLE_LARGE_STRINGS="ON" fi @@ -358,6 +354,12 @@ if buildAll || hasArg cudf; then python ${PYTHON_ARGS_FOR_INSTALL} . fi +# Build and install the cudf_polars Python package +if buildAll || hasArg cudf_polars; then + + cd ${REPODIR}/python/cudf_polars + python ${PYTHON_ARGS_FOR_INSTALL} . 
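+    # cudf_polars is a pure-Python package (see RAPIDS_PY_WHEEL_PURE="1" in the
+    # wheel test script below), so a plain pip install of the package directory
+    # is all that is required here.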
+fi # Build and install the dask_cudf Python package if buildAll || hasArg dask_cudf; then diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 7a12db927e5..485b2ac8a51 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -67,20 +67,33 @@ def emoji_failed(x): # convert pr_results to a pandas DataFrame and then a markdown table pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() -diff_df = pr_df - main_df -total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] -pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) -pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) +total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] +main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1) +main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] +pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1) +pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) +gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) + +gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()) +pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) +pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) +main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) +main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0) -cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) -gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) +diff_df = pr_df - main_df +diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0) +diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0) -# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns -pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' -pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' +# Add '%' suffix to "CPU Usage" and "GPU Usage" columns +pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" +pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] -diff_df = diff_df[["total", "passed", "failed", "skipped"]] +pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] +diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) @@ -99,13 +112,36 @@ def emoji_failed(x): "passed_diff": "Passed delta", "failed_diff": "Failed delta", "skipped_diff": "Skipped delta", + "CPU Usage_diff": "CPU Usage delta", + "GPU Usage_diff": "GPU Usage delta", } ) -df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False) - +df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False) +df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed) +df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed) +df = df[ + [ + "Total tests", + "CPU Usage delta", + "GPU Usage delta", + "Passed tests", + "Failed tests", + "Skipped tests", + 
"CPU Usage", + "GPU Usage", + "Total delta", + "Passed delta", + "Failed delta", + "Skipped delta", + ] +] print(comment) print() -print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print( + f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {gpu_usage_rate_change}%" +) +print() +print(f"Average CPU usage: {cpu_usage_mean}%") print() print("Here are the results of running the Pandas tests against this PR:") print() diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index c6228a4ef33..f6bdc6f9484 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -56,10 +56,10 @@ else echo "" > ./constraints.txt if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` rapids-dependency-file-generator \ --output requirements \ - --file-key test_python \ + --file-key test_python_cudf_pandas \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee ./constraints.txt fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index be55b49870f..870901d223b 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} # Need to distutils-normalize the versions for some use cases -CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") +CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -45,6 +45,8 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh DEPENDENCIES=( cudf cudf_kafka + cugraph + cuml custreamz dask-cuda dask-cudf @@ -57,7 +59,7 @@ DEPENDENCIES=( rmm ) for DEP in "${DEPENDENCIES[@]}"; do - for FILE in dependencies.yaml conda/environments/*.yaml; do + for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do @@ -80,6 +82,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh +sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" diff --git a/ci/run_cudf_polars_polars_tests.sh 
b/ci/run_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..95f78f17f2f --- /dev/null +++ b/ci/run_cudf_polars_polars_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_polars_pytests.sh outside the script directory +# Assumption, polars has been cloned in the root of the repo. +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=native \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..55399d0371a --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? +if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install $(echo ./dist/cudf_polars*.whl) + +TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? 
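+    # $? here is the exit status of the command that tripped the ERR trap.
+    # Stashing it (together with `set +e` below) lets the polars suite run to
+    # completion; the HAS_CHANGES check afterwards decides whether that stashed
+    # status actually fails this CI job.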
+} +EXITCODE=0 +trap set_exitcode ERR +set +e + +rapids-logger "Run polars tests" +./ci/run_cudf_polars_polars_tests.sh + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Running polars test suite PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index d0675b0431a..dc70661a17a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -10,10 +10,10 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" - +FILE_KEY=$1 rapids-dependency-file-generator \ --output conda \ - --file-key test_python \ + --file-key ${FILE_KEY} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee "${ENV_YAML_DIR}/env.yaml" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ae34047e87f..2386414b32e 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../; # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 06a24773cae..67c97ad29a5 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_other rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 28ded2f8e0f..a701bfe15e0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf" pushd python/pylibcudf/pylibcudf/tests python -m pytest \ --cache-clear \ + --numprocesses=8 \ --dist=worksteal \ . 
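Both `--numprocesses=8` (added above) and `--dist=worksteal` (also added to the dask_cudf runs further down) are pytest-xdist options: the test queue is rebalanced at runtime, with idle workers stealing pending tests from busy ones, which helps when per-test runtimes are very uneven. A minimal programmatic sketch of the same invocation (directory path hypothetical):

```python
import pytest  # pytest-xdist must be installed for --numprocesses/--dist

# Run a test directory on 8 workers; with worksteal, idle workers pull queued
# tests from busy ones instead of relying on a fixed up-front partition.
raise SystemExit(pytest.main(["--numprocesses=8", "--dist=worksteal", "tests/"]))
```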
popd diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index da9e50d0a2b..05f882a475b 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -13,10 +13,14 @@ set -eou pipefail if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -35,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then | tee ./constraints.txt fi -# echo to expand wildcard before adding `[extra]` requires for pip +# echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 62d75965b9f..f91bf1e7046 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -84,7 +84,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index f16f2b377df..f4ec6bd5407 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -82,7 +82,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 33fa4b4eccf..dc75eb4b252 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,13 +26,13 @@ librdkafka_version: - ">=2.5.0,<2.6.0a0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" flatbuffers_version: - "=24.3.25" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" nvcomp_version: - "=4.0.1" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..136f43ee706 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ 
-378,7 +378,9 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu + src/io/json/column_tree_construction.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu @@ -797,7 +799,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index abc6f74fccf..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,11 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # -------------------------------------------------------------------------------- ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +# ################################################################################################## +# * tdigest benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu) + # ################################################################################################## # * type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + 
cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/quantiles/tdigest.cu b/cpp/benchmarks/quantiles/tdigest.cu new file mode 100644 index 00000000000..9d37dbc9a26 --- /dev/null +++ b/cpp/benchmarks/quantiles/tdigest.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include + +#include + +void bm_tdigest_merge(nvbench::state& state) +{ + auto const num_tdigests = static_cast(state.get_int64("num_tdigests")); + auto const tdigest_size = static_cast(state.get_int64("tdigest_size")); + auto const tdigests_per_group = + static_cast(state.get_int64("tdigests_per_group")); + auto const max_centroids = static_cast(state.get_int64("max_centroids")); + auto const num_groups = num_tdigests / tdigests_per_group; + auto const total_centroids = num_tdigests * tdigest_size; + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + constexpr int base_value = 5; + + // construct inner means/weights + auto val_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return static_cast(base_value + (i % tdigest_size)); + })); + auto one_iter = thrust::make_constant_iterator(1); + cudf::test::fixed_width_column_wrapper means(val_iter, val_iter + total_centroids); + cudf::test::fixed_width_column_wrapper weights(one_iter, one_iter + total_centroids); + std::vector> inner_struct_children; + inner_struct_children.push_back(means.release()); + inner_struct_children.push_back(weights.release()); + cudf::test::structs_column_wrapper inner_struct(std::move(inner_struct_children)); + + // construct the tdigest lists themselves + auto offset_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return i * tdigest_size; + })); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_tdigests + 1); + auto list_col = cudf::make_lists_column( + num_tdigests, offsets.release(), inner_struct.release(), 0, {}, stream, mr); + + // min and max columns + auto min_iter = thrust::make_constant_iterator(base_value); + auto max_iter = thrust::make_constant_iterator(base_value + (tdigest_size - 1)); + cudf::test::fixed_width_column_wrapper mins(min_iter, min_iter + num_tdigests); + cudf::test::fixed_width_column_wrapper maxes(max_iter, max_iter + num_tdigests); + + // assemble the whole thing + std::vector> tdigest_children; + tdigest_children.push_back(std::move(list_col)); + tdigest_children.push_back(mins.release()); + tdigest_children.push_back(maxes.release()); + cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children)); + + rmm::device_uvector group_offsets(num_groups + 1, stream, mr); + rmm::device_uvector 
group_labels(num_tdigests, stream, mr); + auto group_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_offset_iter, + group_offset_iter + num_groups + 1, + group_offsets.begin()); + auto group_label_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_label_iter, + group_label_iter + num_tdigests, + group_labels.begin()); + + state.add_element_count(total_centroids); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = cudf::tdigest::detail::group_merge_tdigest( + tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr); + timer.stop(); + }); +} + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many tiny groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {1}) + .add_int64_axis("max_centroids", {10000, 1000}); + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many small groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {3}) + .add_int64_axis("max_centroids", {10000, 1000}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index c0e07d02d94..90b0f4d8a8e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,21 +16,12 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET cudf-exports) - rapids_export_package(BUILD spdlog cudf-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET cudf-exports + BUILD_EXPORT_SET cudf-exports + ) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog:: - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - endif() endfunction() find_and_configure_spdlog() diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. 
The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * 
@throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -187,6 +192,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
full_join(cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys,
* @param left_keys The left table
* @param right_keys The right table
* @param compare_nulls Controls whether null join-key values should match or not
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A vector `left_indices` that can be used to construct
@@ -215,6 +222,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -236,6 +244,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
* @param[in] right_keys The right table
* @param[in] compare_nulls controls whether null join-key values
* should match or not.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A column `left_indices` that can be used to construct
@@ -246,6 +255,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
cudf::table_view const& left_keys,
cudf::table_view const& right_keys,
null_equality compare_nulls = null_equality::EQUAL,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -266,6 +276,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
*
* @param left The left table
* @param right The right table
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
*
* @return Result of cross joining `left` and `right` tables
@@ -273,6 +284,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
std::unique_ptr<cudf::table> cross_join(
cudf::table_view const& left,
cudf::table_view const& right,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
@@ -567,6 +579,7 @@ class distinct_hash_join {
* @param right The right table
* @param binary_predicate The condition on which to join
* @param output_size Optional value which allows users to specify the exact output size
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left,
table_view const& right,
ast::expression const& binary_predicate,
std::optional<std::size_t> output_size = {},
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
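A minimal usage sketch of the stream-aware overloads this header change introduces (illustrative only, not part of the patch; `left`, `right`, and `stream` are placeholder names, and the AST predicate simply compares column 0 of the two tables):

  #include <cudf/ast/expressions.hpp>
  #include <cudf/join.hpp>
  #include <cudf/table/table_view.hpp>
  #include <rmm/cuda_stream_view.hpp>

  void join_examples(cudf::table_view left, cudf::table_view right, rmm::cuda_stream_view stream)
  {
    // Equality join on the key columns; device work is now ordered on the
    // caller's stream instead of cudf::get_default_stream().
    auto [inner_left, inner_right] =
      cudf::inner_join(left, right, cudf::null_equality::EQUAL, stream);

    // Conditional join on an AST predicate; pre-computing the output size via
    // the matching *_size API skips the sizing pass inside the join itself.
    namespace ast = cudf::ast;
    auto left_col  = ast::column_reference{0, ast::table_reference::LEFT};
    auto right_col = ast::column_reference{0, ast::table_reference::RIGHT};
    auto predicate = ast::operation{ast::ast_operator::EQUAL, left_col, right_col};

    std::size_t const size = cudf::conditional_inner_join_size(left, right, predicate, stream);
    auto [cond_left, cond_right] =
      cudf::conditional_inner_join(left, right, predicate, size, stream);
  }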
@@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * 
@param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. * * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table(
std::optional<std::string_view> tzif_dir,
std::string_view timezone_name,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
} // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index fd9a6b8f5fe..ddb0dbcd96d 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -580,142 +580,167 @@ std::unique_ptr<column> extract_quarter(column_view const& column,
std::unique_ptr<column> ceil_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr);
}

std::unique_ptr<column> floor_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr);
}

std::unique_ptr<column> round_datetimes(column_view const& column,
rounding_frequency freq,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::round_general(
- detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr);
+ return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr);
}

-std::unique_ptr<column> extract_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_year(column, cudf::get_default_stream(), mr);
+ return detail::extract_year(column, stream, mr);
}

-std::unique_ptr<column> extract_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_month(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_month(column, cudf::get_default_stream(), mr);
+ return detail::extract_month(column, stream, mr);
}

-std::unique_ptr<column> extract_day(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_day(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_day(column, cudf::get_default_stream(), mr);
+ return detail::extract_day(column, stream, mr);
}

std::unique_ptr<column> extract_weekday(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_weekday(column, cudf::get_default_stream(), mr);
+ return detail::extract_weekday(column, stream, mr);
}

-std::unique_ptr<column> extract_hour(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_hour(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_hour(column, cudf::get_default_stream(), mr);
+ return detail::extract_hour(column, stream, mr);
}

-std::unique_ptr<column> extract_minute(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_minute(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_minute(column, cudf::get_default_stream(), mr);
+ return detail::extract_minute(column, stream, mr);
}

-std::unique_ptr<column> extract_second(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_second(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_second(column, cudf::get_default_stream(), mr);
+ return detail::extract_second(column, stream, mr);
}

std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_millisecond_fraction(column, stream, mr);
}

std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_microsecond_fraction(column, stream, mr);
}

std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
+ return detail::extract_nanosecond_fraction(column, stream, mr);
}

std::unique_ptr<column> last_day_of_month(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
+ return detail::last_day_of_month(column, stream, mr);
}

-std::unique_ptr<column> day_of_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> day_of_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::day_of_year(column, cudf::get_default_stream(), mr);
+ return detail::day_of_year(column, stream, mr);
}

std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
cudf::column_view const& months_column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::add_calendrical_months(
- timestamp_column, months_column, cudf::get_default_stream(), mr);
+ return detail::add_calendrical_months(timestamp_column, months_column, stream, mr);
}

std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
cudf::scalar const& months,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
+ return detail::add_calendrical_months(timestamp_column, months, stream, mr);
}

-std::unique_ptr<column> is_leap_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> is_leap_year(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::is_leap_year(column, cudf::get_default_stream(), mr);
+ return detail::is_leap_year(column, stream, mr);
}

-std::unique_ptr<column> days_in_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> days_in_month(column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::days_in_month(column, cudf::get_default_stream(), mr);
+ return detail::days_in_month(column, stream, mr);
}

std::unique_ptr<column> extract_quarter(column_view const& column,
+ rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::extract_quarter(column, cudf::get_default_stream(), mr);
+ return detail::extract_quarter(column, stream, mr);
}

} // namespace datetime
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 6498a5e6c55..cf239297255 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year)
std::unique_ptr<table>
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu new file mode 100644 index 00000000000..c4fe7926706 --- /dev/null +++ b/cpp/src/io/json/column_tree_construction.cu @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::io::json { + +using row_offset_t = size_type; + +#ifdef CSR_DEBUG_PRINT +template +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) +{ + stream.synchronize(); + auto h_vec = cudf::detail::make_std_vector_sync(d_vec, stream); + std::cout << name << " = "; + for (auto e : h_vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} +#endif + +namespace experimental::detail { + +struct level_ordering { + device_span node_levels; + device_span col_ids; + device_span parent_node_ids; + __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const + { + auto lhs_parent_col_id = parent_node_ids[lhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[lhs_node_id]]; + auto rhs_parent_col_id = parent_node_ids[rhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[rhs_node_id]]; + + return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id < rhs_parent_col_id) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id == rhs_parent_col_id && col_ids[lhs_node_id] < col_ids[rhs_node_id]); + } +}; + +struct parent_nodeids_to_colids { + device_span rev_mapped_col_ids; + __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT + { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : rev_mapped_col_ids[parent_node_id]; + } +}; + +/** + * @brief Reduces node tree representation to column tree CSR representation. 
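+ * For intuition (illustration only): for input rows like {"a": {"b": 1}}, the
+ * column tree is root -> "a" -> "b", and the CSR pair built here stores, per
+ * column, its parent id together with its child ids as the adjacency list.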
+ * + * @param node_tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple reduce_to_column_tree( + tree_meta_t& node_tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + if (original_col_ids.empty()) { + rmm::device_uvector empty_row_idx(0, stream); + rmm::device_uvector empty_col_idx(0, stream); + rmm::device_uvector empty_column_categories(0, stream); + rmm::device_uvector empty_max_row_offsets(0, stream); + rmm::device_uvector empty_mapped_col_ids(0, stream); + return std::tuple{compressed_sparse_row{std::move(empty_row_idx), std::move(empty_col_idx)}, + column_tree_properties{std::move(empty_column_categories), + std::move(empty_max_row_offsets), + std::move(empty_mapped_col_ids)}}; + } + + auto [unpermuted_tree, unpermuted_col_ids, unpermuted_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(node_tree, + original_col_ids, + sorted_col_ids, + ordered_node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + NodeIndexT num_columns = unpermuted_col_ids.size(); + + auto mapped_col_ids = cudf::detail::make_device_uvector_async( + unpermuted_col_ids, stream, cudf::get_current_device_resource_ref()); + rmm::device_uvector rev_mapped_col_ids(num_columns, stream); + rmm::device_uvector reordering_index(unpermuted_col_ids.size(), stream); + + thrust::sequence( + rmm::exec_policy_nosync(stream), reordering_index.begin(), reordering_index.end()); + // Reorder nodes and column ids in level-wise fashion + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), + reordering_index.begin(), + reordering_index.end(), + mapped_col_ids.begin(), + level_ordering{ + unpermuted_tree.node_levels, unpermuted_col_ids, unpermuted_tree.parent_node_ids}); + + { + auto mapped_col_ids_copy = cudf::detail::make_device_uvector_async( + mapped_col_ids, stream, cudf::get_current_device_resource_ref()); + thrust::sequence( + rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), + mapped_col_ids_copy.begin(), + mapped_col_ids_copy.end(), + rev_mapped_col_ids.begin()); + } + + rmm::device_uvector parent_col_ids(num_columns, stream); + thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(), + parent_nodeids_to_colids{rev_mapped_col_ids}); + rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector column_categories(num_columns, stream); + thrust::copy_n( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_permutation_iterator( + unpermuted_tree.parent_node_ids.begin(), reordering_index.begin()), + thrust::make_permutation_iterator(unpermuted_max_row_offsets.begin(), + reordering_index.begin()), + thrust::make_permutation_iterator( + unpermuted_tree.node_categories.begin(), reordering_index.begin())), + num_columns, + thrust::make_zip_iterator( + parent_col_ids_it, 
max_row_offsets.begin(), column_categories.begin())); + +#ifdef CSR_DEBUG_PRINT + print(reordering_index, "h_reordering_index", stream); + print(mapped_col_ids, "h_mapped_col_ids", stream); + print(rev_mapped_col_ids, "h_rev_mapped_col_ids", stream); + print(parent_col_ids, "h_parent_col_ids", stream); + print(max_row_offsets, "h_max_row_offsets", stream); +#endif + + auto construct_row_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids) { + auto row_idx = cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns + 1), stream, cudf::get_current_device_resource_ref()); + // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) + // children adjacency + + auto num_non_leaf_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); + rmm::device_uvector non_leaf_nodes(num_non_leaf_columns, stream); + rmm::device_uvector non_leaf_nodes_children(num_non_leaf_columns, stream); + thrust::reduce_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + non_leaf_nodes.begin(), + non_leaf_nodes_children.begin(), + thrust::equal_to()); + + thrust::scatter(rmm::exec_policy_nosync(stream), + non_leaf_nodes_children.begin(), + non_leaf_nodes_children.end(), + non_leaf_nodes.begin(), + row_idx.begin() + 1); + + if (num_columns > 1) { + thrust::transform_inclusive_scan( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(1), row_idx.begin() + 1), + thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, row_idx.end()), + row_idx.begin() + 1, + cuda::proclaim_return_type([] __device__(auto a) { + auto n = thrust::get<0>(a); + auto idx = thrust::get<1>(a); + return n == 1 ? idx : idx + 1; + }), + thrust::plus{}); + } else { + auto single_node = 1; + row_idx.set_element_async(1, single_node, stream); + } + +#ifdef CSR_DEBUG_PRINT + print(row_idx, "h_row_idx", stream); +#endif + return row_idx; + }; + + auto construct_col_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids, + device_span row_idx) { + rmm::device_uvector col_idx((num_columns - 1) * 2, stream); + thrust::fill(rmm::exec_policy_nosync(stream), col_idx.begin(), col_idx.end(), -1); + // excluding root node, construct scatter map + rmm::device_uvector map(num_columns - 1, stream); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + map.begin()); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + num_columns - 1, + [row_idx = row_idx.begin(), + map = map.begin(), + parent_col_ids = parent_col_ids.begin()] __device__(auto i) { + auto parent_col_id = parent_col_ids[i]; + if (parent_col_id == 0) + --map[i - 1]; + else + map[i - 1] += row_idx[parent_col_id]; + }); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(1) + num_columns - 1, + map.begin(), + col_idx.begin()); + + // Skip the parent of root node + thrust::scatter(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + row_idx.begin() + 1, + col_idx.begin()); + +#ifdef CSR_DEBUG_PRINT + print(col_idx, "h_col_idx", stream); +#endif + + return col_idx; + }; + + /* + 5. CSR construction: + a. Sort column levels and get their ordering + b. 
For each column node coln iterated according to sorted_column_levels; do + i. Find nodes that have coln as the parent node -> set adj_coln + ii. row idx[coln] = size of adj_coln + 1 + iii. col idx[coln] = adj_coln U {parent_col_id[coln]} + */ + auto row_idx = construct_row_idx(num_columns, parent_col_ids); + auto col_idx = construct_col_idx(num_columns, parent_col_ids, row_idx); + + return std::tuple{ + compressed_sparse_row{std::move(row_idx), std::move(col_idx)}, + column_tree_properties{ + std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; +} + +} // namespace experimental::detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ *
+ * @param input String device buffer
+ * @param node_range_begin Begin offset of the strings
+ * @param node_range_end End offset of the strings
+ * @param stream CUDA stream
+ * @return Vector of strings
+ */
+std::vector<std::string> copy_strings_to_host_sync(
+ device_span<char const> input,
+ device_span<SymbolOffsetT const> node_range_begin,
+ device_span<SymbolOffsetT const> node_range_end,
+ rmm::cuda_stream_view stream)
+{
+ CUDF_FUNC_RANGE();
+ auto const num_strings = node_range_begin.size();
+ rmm::device_uvector<size_type> string_offsets(num_strings, stream);
+ rmm::device_uvector<size_type> string_lengths(num_strings, stream);
+ auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
+ thrust::transform(rmm::exec_policy(stream),
+ d_offset_pairs,
+ d_offset_pairs + num_strings,
+ thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
+ [] __device__(auto const& offsets) {
+ // Note: first character for non-field columns
+ return thrust::make_tuple(
+ static_cast<size_type>(thrust::get<0>(offsets)),
+ static_cast<size_type>(thrust::get<1>(offsets) - thrust::get<0>(offsets)));
+ });
+
+ cudf::io::parse_options_view options_view{};
+ options_view.quotechar = '\0'; // no quotes
+ options_view.keepquotes = true;
+ auto d_offset_length_it =
+ thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin());
+ auto d_column_names = parse_data(input.data(),
+ d_offset_length_it,
+ num_strings,
+ data_type{type_id::STRING},
+ rmm::device_buffer{},
+ 0,
+ options_view,
+ stream,
+ cudf::get_current_device_resource_ref());
+ auto to_host = [stream](auto const& col) {
+ if (col.is_empty()) return std::vector<std::string>{};
+ auto const scv = cudf::strings_column_view(col);
+ auto const h_chars = cudf::detail::make_host_vector_async(
+ cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
+ auto const h_offsets = cudf::detail::make_host_vector_async(
+ cudf::device_span<size_type const>(scv.offsets().data<size_type>() + scv.offset(),
+ scv.size() + 1),
+ stream);
+ stream.synchronize();
+
+ // build std::string vector from chars and offsets
+ std::vector<std::string> host_data;
+ host_data.reserve(col.size());
+ std::transform(
+ std::begin(h_offsets),
+ std::end(h_offsets) - 1,
+ std::begin(h_offsets) + 1,
+ std::back_inserter(host_data),
+ [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
+ return host_data;
+ };
+ return to_host(d_column_names->view());
+}
+
+/**
+ * @brief Checks whether each string column in the tree contains only null literals.
+ * Entries for non-string columns are set to true; if any row of a string column is not a null
+ * literal, that column's entry is set to false.
+ *
+ * @param input Input JSON string device data
+ * @param d_column_tree column tree representation of JSON string
+ * @param tree Node tree representation of the JSON string
+ * @param col_ids Column ids of the nodes in the tree
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Array of bytes where each byte indicates whether the corresponding string column
+ * contains only nulls
+ */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
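+ // Collision rules implemented below: a scalar (VAL/STR) node arriving at a name
+ // that already holds a nested column is ignored; a nested node arriving at a name
+ // that holds a scalar replaces it; with mixed-types-as-string enabled, a genuine
+ // mix (e.g. {"a": 1} in one row, {"a": {"b": 2}} in another) instead coerces
+ // column "a" to a string column and drops its children; mixing lists and structs
+ // under one name is rejected with an error.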
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true;
+ ignore_vals[this_col_id] = 1;
+ }
+ // Convert only the forced-as-string columns themselves to strings (so they get
+ // copied), but not their children
+ if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
+ forced_as_string_column[this_col_id])
+ column_categories[this_col_id] = NC_STR;
+ }
+ cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+ column_categories.data(),
+ column_categories.size() * sizeof(column_categories[0]),
+ cudf::detail::host_memory_kind::PAGEABLE,
+ stream);
+
+ // restore unique_col_ids order
+ std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
+ return thrust::get<1>(a) < thrust::get<1>(b);
+ });
+ return {ignore_vals, columns};
+}
+
+void scatter_offsets(
+ tree_meta_t& tree,
+ device_span col_ids,
+ device_span row_offsets,
+ device_span node_ids,
+ device_span sorted_col_ids, // Reuse this for parent_col_ids
+ tree_meta_t& d_column_tree,
+ host_span ignore_vals,
+ std::unordered_map>& columns,
+ rmm::cuda_stream_view stream)
+{
+ auto const num_nodes = col_ids.size();
+ auto const num_columns = d_column_tree.node_categories.size();
+ // move column data pointers to the device.
+ auto columns_data = cudf::detail::make_host_vector(num_columns, stream);
+ for (auto& [col_id, col_ref] : columns) {
+ if (col_id == parent_node_sentinel) continue;
+ auto& col = col_ref.get();
+ columns_data[col_id] = json_column_data{col.string_offsets.data(),
+ col.string_lengths.data(),
+ col.child_offsets.data(),
+ static_cast(col.validity.data())};
+ }
+
+ auto d_ignore_vals = cudf::detail::make_device_uvector_async(
+ ignore_vals, stream, cudf::get_current_device_resource_ref());
+ auto d_columns_data = cudf::detail::make_device_uvector_async(
+ columns_data, stream, cudf::get_current_device_resource_ref());
+
+ // 3. scatter string offsets to respective columns, set validity bits
+ thrust::for_each_n(
+ rmm::exec_policy(stream),
+ thrust::counting_iterator(0),
+ num_nodes,
+ [column_categories = d_column_tree.node_categories.begin(),
+ col_ids = col_ids.begin(),
+ row_offsets = row_offsets.begin(),
+ range_begin = tree.node_range_begin.begin(),
+ range_end = tree.node_range_end.begin(),
+ d_ignore_vals = d_ignore_vals.begin(),
+ d_columns_data = d_columns_data.begin()] __device__(size_type i) {
+ if (d_ignore_vals[col_ids[i]]) return;
+ auto const node_category = column_categories[col_ids[i]];
+ switch (node_category) {
+ case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+ case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+ case NC_STR: [[fallthrough]];
+ case NC_VAL:
+ if (d_ignore_vals[col_ids[i]]) break;
+ set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]);
+ d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i];
+ d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i];
+ break;
+ default: break;
+ }
+ });
+
+ // 4. scatter List offset
+ // copy_if only nodes whose parent is a list: (node_id, parent_col_id)
+ // stable_sort by parent_col_id of {node_id}.
+ // For all unique parent_node_id of (i==0, i-1!=i), write start offset.
+ // (i==last, i+1!=i), write end offset.
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
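+ // For string and list columns the scattered offsets are sparse: null or empty
+ // rows are never written and keep their zero-initialized values. The inclusive
+ // scan with thrust::maximum in step 5 below fills those gaps, e.g. gathered
+ // offsets {0, 5, 0, 9} become {0, 5, 5, 9}, so each row inherits the last
+ // offset written before it.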
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 756047d383a..dfd9285f682 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,26 +35,18 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { -// DEBUG prints auto to_cat = [](auto v) -> std::string { switch (v) { case NC_STRUCT: return " S"; @@ -114,18 +105,19 @@ void print_tree(host_span input, */ std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); + // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + auto const num_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end()); // 2. reduce_by_key {col_id}, {row_offset}, max. rmm::device_uvector unique_col_ids(num_columns, stream); @@ -170,30 +162,34 @@ reduce_to_column_tree(tree_meta_t& tree, }); // 4. 
unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector column_levels(num_columns, stream); // not required rmm::device_uvector parent_col_ids(num_columns, stream); rmm::device_uvector col_range_begin(num_columns, stream); // Field names rmm::device_uvector col_range_end(num_columns, stream); rmm::device_uvector unique_node_ids(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), + thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end(), ordered_node_ids.begin(), thrust::make_discard_iterator(), unique_node_ids.begin()); + thrust::copy_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), unique_node_ids.size(), - thrust::make_zip_iterator( - parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); + thrust::make_zip_iterator(column_levels.begin(), + parent_col_ids.begin(), + col_range_begin.begin(), + col_range_end.begin())); // convert parent_node_ids to parent_col_ids thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -211,18 +207,17 @@ reduce_to_column_tree(tree_meta_t& tree, column_categories[parent_col_id] == NC_LIST && (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); }; + // Mixed types in List children go to different columns, // so all immediate children of list column should have same max_row_offsets. // create list's children max_row_offsets array. (initialize to zero) // atomicMax on children max_row_offsets array. // gather the max_row_offsets from children row offset array. { - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - thrust::for_each(rmm::exec_policy(stream), + auto list_parents_children_max_row_offsets = + cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns), stream, cudf::get_current_device_resource_ref()); + thrust::for_each(rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), [column_categories = column_categories.begin(), @@ -238,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree, ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); } }); + thrust::gather_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -254,7 +250,7 @@ reduce_to_column_tree(tree_meta_t& tree, // copy lists' max_row_offsets to children. // all structs should have same size. 
thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), max_row_offsets.begin(), @@ -280,7 +276,7 @@ reduce_to_column_tree(tree_meta_t& tree, // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), col_range_begin.begin(), col_range_begin.end(), column_categories.begin(), @@ -297,678 +293,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. - * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto 
const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. - */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. 
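The `to_host` helper above rebuilds owning `std::string`s from a strings column's flat character buffer plus its offsets array (which has one more entry than there are strings). A minimal host-only sketch of that reconstruction, with hypothetical sample data:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Rebuild strings from a flat char buffer and size + 1 offsets, mirroring
    // the chars/offsets layout of a strings column copied to the host.
    std::vector<std::string> offsets_to_strings(std::vector<char> const& chars,
                                                std::vector<int32_t> const& offsets)
    {
      std::vector<std::string> out;
      out.reserve(offsets.size() - 1);
      for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
        out.emplace_back(chars.data() + offsets[i], offsets[i + 1] - offsets[i]);
      }
      return out;
    }

    int main()
    {
      std::vector<char> chars{'a', 'b', 'c', 'd', 'e', 'f'};  // "ab" "c" "def"
      std::vector<int32_t> offsets{0, 2, 3, 6};
      for (auto const& s : offsets_to_strings(chars, offsets)) std::cout << s << '\n';
    }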
- * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 
1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
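For orientation, `initialize_json_columns` above sizes each column's buffers from its maximum row offset: string/value columns get `max + 1` offset and length slots, list columns get `max + 2` child offsets, and every column holds `max + 1` rows. A rough host-side sketch of just that sizing rule (the names here are illustrative, not library API):

    #include <cstddef>
    #include <iostream>

    enum class col_kind { string_like, list, object };

    struct alloc_sizes {
      std::size_t num_rows, string_offsets, string_lengths, child_offsets;
    };

    alloc_sizes sizes_for(col_kind kind, std::size_t max_row_offset)
    {
      alloc_sizes s{max_row_offset + 1, 0, 0, 0};  // every column has max + 1 rows
      if (kind == col_kind::string_like) {
        s.string_offsets = s.string_lengths = max_row_offset + 1;
      } else if (kind == col_kind::list) {
        s.child_offsets = max_row_offset + 2;  // extra slot for the trailing end offset
      }
      return s;
    }

    int main()
    {
      auto const s = sizes_for(col_kind::list, 3);
      std::cout << s.num_rows << " rows, " << s.child_offsets << " child offsets\n";  // 4 rows, 5 child offsets
    }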
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
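The pruning pass above keeps a column only when it, or something on its path, carries a user-requested dtype: a column with no dtype and a parent is tentatively pruned, while a requested dtype rescues the whole ancestor chain. A small host sketch of that rule with made-up ids:

    #include <iostream>
    #include <optional>
    #include <vector>

    int main()
    {
      constexpr int sentinel = -1;                 // the root has no parent
      std::vector<int> parent{sentinel, 0, 1, 1};  // 0 <- 1 <- {2, 3}
      std::vector<std::optional<int>> user_dtype{{}, {}, 42, {}};  // only column 2 requested
      std::vector<int> pruned(parent.size(), 0);

      for (int col = 0; col < static_cast<int>(parent.size()); ++col) {
        if (!user_dtype[col].has_value() && parent[col] != sentinel) {
          pruned[col] = 1;  // tentatively prune: no dtype requested here
        } else {
          // a requested dtype un-prunes every ancestor on the path to the root
          for (int p = parent[col]; p != sentinel && pruned[p] == 1; p = parent[p]) pruned[p] = 0;
        }
      }
      for (auto v : pruned) std::cout << v << ' ';  // prints: 0 0 0 1 (only column 3 stays pruned)
    }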
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
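The `is_mixed_type` lambda above deliberately ignores a string/value side whose rows are all null literals, so fields that only ever held `null` do not force a struct or list sibling into a mixed-type string column. The same decision restated as a standalone predicate (a hypothetical helper, same logic):

    #include <iostream>

    enum node_cat { NC_STRUCT, NC_LIST, NC_STR, NC_VAL };

    // Two same-named columns merge as "mixed type" unless one side is a
    // string/value column consisting solely of null literals.
    bool is_mixed(node_cat new_cat, bool new_all_nulls, node_cat old_cat, bool old_all_nulls)
    {
      if ((new_cat == NC_STR || new_cat == NC_VAL) && new_all_nulls) return false;
      if ((old_cat == NC_STR || old_cat == NC_VAL) && old_all_nulls) return false;
      return true;
    }

    int main()
    {
      std::cout << is_mixed(NC_VAL, true, NC_STRUCT, false) << '\n';  // 0: all-null side merges quietly
      std::cout << is_mixed(NC_STR, false, NC_LIST, false) << '\n';   // 1: genuinely mixed
    }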
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
- forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 75639a0438f..93ef2b46be1 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -185,6 +185,55 @@ struct device_json_column { } }; +namespace experimental { +/* + * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. + */ +struct compressed_sparse_row { + rmm::device_uvector row_idx; + rmm::device_uvector col_idx; +}; + +/* + * @brief Auxiliary column tree properties that are required to construct the device json + * column subtree, but not required for the final cudf column construction. 
+ */
+struct column_tree_properties {
+ rmm::device_uvector categories;
+ rmm::device_uvector max_row_offsets;
+ rmm::device_uvector mapped_ids;
+};
+
+namespace detail {
+/**
+ * @brief Reduce node tree into column tree by aggregating each property of the column.
+ *
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Tuple of compressed_sparse_row struct storing adjacency information of the column tree,
+ * and column_tree_properties struct storing properties of each node, i.e., column category, max
+ * number of rows in the column, and column id
+ */
+CUDF_EXPORT
+std::tuple reduce_to_column_tree(
+ tree_meta_t& node_tree,
+ device_span original_col_ids,
+ device_span sorted_col_ids,
+ device_span ordered_node_ids,
+ device_span row_offsets,
+ bool is_array_of_arrays,
+ NodeIndexT row_array_parent_col_id,
+ rmm::cuda_stream_view stream);
+
+} // namespace detail
+} // namespace experimental
+
 namespace detail {
 // TODO: return device_uvector instead of passing pre-allocated memory
@@ -299,22 +348,59 @@ get_array_children_indices(TreeDepthT row_array_children_level,
 device_span node_levels,
 device_span parent_node_ids,
 rmm::cuda_stream_view stream);
+
 /**
- * @brief Reduce node tree into column tree by aggregating each property of column.
+ * @brief Reduces node tree representation to column tree representation.
 *
- * @param tree json node tree to reduce (modified in-place, but restored to original state)
- * @param col_ids column ids of each node (modified in-place, but restored to original state)
- * @param row_offsets row offsets of each node (modified in-place, but restored to original state)
- * @param stream The CUDA stream to which kernels are dispatched
- * @return A tuple containing the column tree, identifier for each column and the maximum row index
- * in each column
+ * @param node_tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A tuple of column tree representation of JSON string, column ids of columns, and
+ * max row offsets of columns
 */
+CUDF_EXPORT
 std::tuple, rmm::device_uvector>
-reduce_to_column_tree(tree_meta_t& tree,
- device_span col_ids,
- device_span row_offsets,
+reduce_to_column_tree(tree_meta_t& node_tree,
+ device_span original_col_ids,
+ device_span sorted_col_ids,
+ device_span ordered_node_ids,
+ device_span row_offsets,
+ bool is_array_of_arrays,
+ NodeIndexT const row_array_parent_col_id,
 rmm::cuda_stream_view stream);
-
+/**
+ * @brief Constructs `d_json_column` from node tree representation.
+ * Newly constructed columns are inserted into `root`'s children.
+ * `root` must be a list type.
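One conventional reading of the `compressed_sparse_row` struct above (an assumption; the exact adjacency the experimental column tree stores may differ): `row_idx[i]..row_idx[i+1]` brackets the slice of `col_idx` holding column i's neighbours. A host illustration with plain vectors standing in for the device vectors:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
      // Column tree 0 -> {1, 2}, 2 -> {3}, child lists concatenated in CSR form.
      std::vector<int> row_idx{0, 2, 2, 3, 3};  // num_columns + 1 entries
      std::vector<int> col_idx{1, 2, 3};        // concatenated child lists

      for (std::size_t node = 0; node + 1 < row_idx.size(); ++node) {
        std::cout << "column " << node << " children:";
        for (int j = row_idx[node]; j < row_idx[node + 1]; ++j) std::cout << ' ' << col_idx[j];
        std::cout << '\n';
      }
    }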
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return 
detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return 
detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 573101cefd9..86402a0e7de 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -50,6 +51,11 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; +using semi_map_type = cuco::legacy::static_map>; + using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { 
CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 89c13285cfe..19701816867 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,7 +25,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -161,38 +160,6 @@ struct pair_expression_equality : public expression_equality { } }; -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -struct double_row_equality_comparator { - row_equality const equality_comparator; - row_equality const conditional_comparator; - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } -}; - -// A CUDA Cooperative Group of 4 threads for the hash set. -auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; - -// The hash set type used by mixed_semi_join with the build_table. -using hash_set_type = cuco::static_set, - cuda::thread_scope_device, - double_row_equality_comparator, - cuco::linear_probing, - cudf::detail::cuco_allocator, - cuco::storage<1>>; - -// The hash_set_ref_type used by mixed_semi_join kerenels for probing. 
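The visible effect of the stream plumbing in the join files above is that the public join entry points now take the stream explicitly instead of always running on `cudf::get_default_stream()`. A usage sketch against the new `inner_join` overload (the two `sequence` key columns are fabricated for the example):

    #include <cstdint>
    #include <cudf/column/column.hpp>
    #include <cudf/filling.hpp>
    #include <cudf/join.hpp>
    #include <cudf/scalar/scalar.hpp>
    #include <cudf/table/table_view.hpp>
    #include <rmm/cuda_stream.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    int main()
    {
      rmm::cuda_stream stream;  // a non-default stream owned by the caller

      // Two single-column key tables: values 0..9 and 5..14.
      cudf::numeric_scalar<int32_t> zero{0}, five{5};
      auto left_col  = cudf::sequence(10, zero);
      auto right_col = cudf::sequence(10, five);
      cudf::table_view left{{left_col->view()}};
      cudf::table_view right{{right_col->view()}};

      // The stream is now an explicit argument rather than an internal default.
      auto [left_idx, right_idx] = cudf::inner_join(left,
                                                    right,
                                                    cudf::null_equality::EQUAL,
                                                    stream.view(),
                                                    rmm::mr::get_current_device_resource());
      stream.synchronize();
      return left_idx->size() == 5 ? 0 : 1;  // keys 5..9 overlap
    }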
-using hash_set_ref_type = hash_set_type::ref_type; - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index f2c5ff13638..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,16 +38,12 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { - auto constexpr cg_size = hash_set_ref_type::cg_size; - - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -56,24 +52,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = left_num_rows; - cudf::size_type const outer_num_rows = left_table.num_rows(); - auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { - // Make sure to swap_tables here as hash_set will use probe table as the left one. - auto constexpr swap_tables = true; // Figure out the number of elements for this key. 
auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - auto const set_ref_equality = set_ref.with_key_eq(equality); - auto const result = set_ref_equality.contains(tile, outer_row_index); - if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -82,8 +78,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -97,8 +94,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } else { @@ -108,8 +106,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index b08298e64e4..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,8 +45,9 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] set_ref The hash table device view built from `build`. + * @param[in] hash_table_view The hash table built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -57,8 +58,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 719b1d47105..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -46,6 +45,45 @@ namespace cudf { namespace detail { +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function_semi { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. 
+ return cuco::make_pair(static_cast(i), 0); + } +}; + +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +class double_row_equality { + public: + double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) + : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} + { + } + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } + + private: + row_equality _equality_comparator; + row_equality _conditional_comparator; +}; + +} // namespace + std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -57,7 +95,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -98,7 +136,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -117,20 +155,27 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - cudf::experimental::row::equality::preprocessed_table::create(build, stream); + experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - + auto const hash_build = row_hash_build.device_hasher(build_nulls); // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -145,28 +190,20 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; - hash_set_type row_set{ - {compute_hash_table_size(build.num_rows())}, - cuco::empty_key{JoinNoneValue}, - {equality_build_equality, equality_build_conditional}, - {row_hash_build.device_hasher(build_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - {stream.value()}}; - - auto iter = thrust::make_counting_iterator(0); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - row_set.insert(iter, iter + right_num_rows, stream.value()); + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -174,19 +211,18 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); } + auto hash_table_view = hash_table.get_device_view(); + detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = - parser.shmem_per_thread * - cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); - // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -195,8 +231,9 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, + hash_probe, equality_probe, - row_set_ref, + hash_table_view, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, @@ -229,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + 
rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -239,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -250,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -260,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2dd25a7b890..e1c1d2e3002 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1021,6 +1021,76 @@ struct group_key_func { } }; +// merges all the tdigests within each group. returns a table containing 2 columns: +// the sorted means and weights. +template +std::pair, rmm::device_uvector> generate_merged_centroids( + tdigest_column_view const& tdv, + GroupOffsetIter group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream) +{ + auto temp_mr = cudf::get_current_device_resource_ref(); + + auto const total_merged_centroids = tdv.means().size(); + + // output is the merged centroids (means, weights) + rmm::device_uvector output_means(total_merged_centroids, stream, temp_mr); + rmm::device_uvector output_weights(total_merged_centroids, stream, temp_mr); + + // each group represents a collection of tdigest columns. each row is 1 tdigest. + // within each group, we want to sort all the centroids within all the tdigests + // in that group, using the means as the key. the "outer offsets" represent the indices of the + // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. 
+ // + // rows + // ---- centroid 0 --------- + // tdigest 0 centroid 1 + // ---- centroid 2 group 0 + // tdigest 1 centroid 3 + // ---- centroid 4 --------- + // tdigest 2 centroid 5 + // ---- centroid 6 group 1 + // tdigest 3 centroid 7 + // centroid 8 + // ---- centroid 9 -------- + auto inner_offsets = tdv.centroids().offsets(); + auto centroid_offsets = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return inner_offsets[group_offsets[i]]; })); + + // perform the sort using the means as the key + size_t temp_size; + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(nullptr, + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + rmm::device_buffer temp_mem(temp_size, stream, temp_mr); + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(temp_mem.data(), + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + return {std::move(output_means), std::move(output_weights)}; +} + template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, HGroupOffsetIter h_outer_offsets, @@ -1032,59 +1102,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an - // algorithm like a super-merge that takes two layers of keys: one which identifies the outer - // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the - // outer groups. - // TODO: investigate replacing the iterative merge with a single stable_sort_by_key. - - // bring tdigest offsets back to the host - auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); - cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(size_type) * tdigest_offsets.size(), - cudaMemcpyDefault, - stream); - - stream.synchronize(); - - // extract all means and weights into a table - cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()}); - - // generate the merged (but not yet compressed) tdigests for each group. 
- std::vector> tdigests; - tdigests.reserve(num_groups); - std::transform(h_outer_offsets, - h_outer_offsets + num_groups, - std::next(h_outer_offsets), - std::back_inserter(tdigests), - [&](auto tdigest_start, auto tdigest_end) { - // the range of tdigests in this group - auto const num_tdigests = tdigest_end - tdigest_start; - - // slice each tdigest from the input - std::vector unmerged_tdigests; - unmerged_tdigests.reserve(num_tdigests); - auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); - std::transform( - offset_iter, - offset_iter + num_tdigests, - std::next(offset_iter), - std::back_inserter(unmerged_tdigests), - [&](size_type start, size_type end) { - return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); - }); - - // merge - return cudf::detail::merge(unmerged_tdigests, - {0}, - {order::ASCENDING}, - {}, - stream, - cudf::get_current_device_resource_ref()); - }); - // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1121,7 +1138,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto group_num_weights = cudf::detail::make_counting_transform_iterator( 0, group_num_weights_func{group_offsets, - tdigest_offsets.begin()}); + tdv.centroids().offsets().begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), @@ -1135,29 +1152,33 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_is_empty{}, 0); - // concatenate all the merged tdigests back into one table. - std::vector tdigest_views; - tdigest_views.reserve(num_groups); - std::transform(tdigests.begin(), - tdigests.end(), - std::back_inserter(tdigest_views), - [](std::unique_ptr
const& t) { return t->view(); }); - auto merged = - cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); + auto temp_mr = cudf::get_current_device_resource_ref(); + + // merge the centroids + auto [merged_means, merged_weights] = + generate_merged_centroids(tdv, group_offsets, num_groups, stream); + size_t const num_centroids = tdv.means().size(); + CUDF_EXPECTS(merged_means.size() == num_centroids, + "Unexpected number of centroids in merged result"); // generate cumulative weights - auto merged_weights = merged->get_column(1).view(); - auto cumulative_weights = cudf::make_numeric_column( - data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); - auto keys = cudf::detail::make_counting_transform_iterator( - 0, - group_key_func{ - group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); + rmm::device_uvector cumulative_weights(merged_weights.size(), stream, temp_mr); + + // generate group keys for all centroids in the entire column + rmm::device_uvector group_keys(num_centroids, stream, temp_mr); + auto iter = thrust::make_counting_iterator(0); + auto inner_offsets = tdv.centroids().offsets(); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_centroids, + group_keys.begin(), + group_key_func{ + group_labels, inner_offsets.begin(), inner_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - keys, - keys + cumulative_weights->size(), - merged_weights.begin(), - cumulative_weights->mutable_view().begin()); + group_keys.begin(), + group_keys.begin() + num_centroids, + merged_weights.begin(), + cumulative_weights.begin()); auto const delta = max_centroids; @@ -1166,37 +1187,32 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, - centroid_group_info{cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + centroid_group_info{ + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, false, stream, mr); // input centroid values auto centroids = cudf::detail::make_counting_transform_iterator( - 0, - make_weighted_centroid{merged->get_column(0).view().begin(), - merged_weights.begin()}); + 0, make_weighted_centroid{merged_means.begin(), merged_weights.begin()}); // compute the tdigest return compute_tdigests( delta, centroids, - centroids + merged->num_rows(), + centroids + merged_means.size(), cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { 
CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + operator cudaEvent_t() { return e_; } private: @@ -147,11 +154,12 @@ struct cuda_event { */ cudaEvent_t event_for_thread() { - thread_local std::vector> thread_events(get_num_cuda_devices()); + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local std::vector thread_events(get_num_cuda_devices()); auto const device_id = get_current_cuda_device(); - if (not thread_events[device_id.value()]) { - thread_events[device_id.value()] = std::make_unique(); - } + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } return *thread_events[device_id.value()]; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bedb344a01..b67d922d377 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -329,6 +329,7 @@ ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cp ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -687,10 +688,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu new file mode 100644 index 00000000000..a336b327732 --- /dev/null +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/json/nested_json.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cuio_json = cudf::io::json; + +struct h_tree_meta_t { + std::vector node_categories; + std::vector parent_node_ids; + std::vector node_range_begin; + std::vector node_range_end; +}; + +struct h_column_tree { + // position of nnzs + std::vector row_idx; + std::vector col_idx; + // node properties + std::vector categories; + std::vector column_ids; +}; + +// debug printing +template +void print(cudf::host_span vec, std::string name) +{ + std::cout << name << " = "; + for (auto e : vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} + +bool check_equality(cuio_json::tree_meta_t& d_a, + cudf::device_span d_a_max_row_offsets, + cuio_json::experimental::compressed_sparse_row& d_b_csr, + cuio_json::experimental::column_tree_properties& d_b_ctp, + rmm::cuda_stream_view stream) +{ + // convert from tree_meta_t to column_tree_csr + stream.synchronize(); + + h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream), + cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), + cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), + cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; + + h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.row_idx, stream), + cudf::detail::make_std_vector_async(d_b_csr.col_idx, stream), + cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), + cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; + + auto a_max_row_offsets = cudf::detail::make_std_vector_async(d_a_max_row_offsets, stream); + auto b_max_row_offsets = cudf::detail::make_std_vector_async(d_b_ctp.max_row_offsets, stream); + + stream.synchronize(); + + auto num_nodes = a.parent_node_ids.size(); + if (num_nodes > 1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + for (auto pos = b.row_idx[0]; pos < b.row_idx[1]; pos++) { + auto v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { return false; } + } + for (size_t u = 1; u < num_nodes; u++) { + auto v = b.col_idx[b.row_idx[u]]; + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { return false; } + + for (auto pos = b.row_idx[u] + 1; pos < b.row_idx[u + 1]; pos++) { + v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { return false; } + } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } else if (num_nodes == 1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + if (b.row_idx[0] != 0 || b.row_idx[1] != 1) return false; + if (!b.col_idx.empty()) return false; + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } + return true; +} + +void run_test(std::string const& input, bool enable_lines = true) +{ + auto const stream = cudf::get_default_stream(); + cudf::string_scalar d_scalar(input, true, stream); + auto d_input = 
cudf::device_span{d_scalar.data(), + static_cast(d_scalar.size())}; + + cudf::io::json_reader_options options{}; + options.enable_lines(enable_lines); + options.enable_mixed_types_as_string(true); + + // Parse the JSON and get the token stream + auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, cudf::get_current_device_resource_ref()); + + // Get the JSON's tree representation + auto gpu_tree = + cuio_json::detail::get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + cudf::get_current_device_resource_ref()); + + bool const is_array_of_arrays = [&]() { + std::array h_node_categories = {cuio_json::NC_ERR, cuio_json::NC_ERR}; + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), + gpu_tree.node_categories.data(), + sizeof(cuio_json::node_t) * size_to_copy, + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + if (options.is_enabled_lines()) return h_node_categories[0] == cuio_json::NC_LIST; + return h_node_categories[0] == cuio_json::NC_LIST and + h_node_categories[1] == cuio_json::NC_LIST; + }(); + + auto tup = + cuio_json::detail::records_orient_tree_traversal(d_input, + gpu_tree, + is_array_of_arrays, + options.is_enabled_lines(), + stream, + rmm::mr::get_current_device_resource()); + auto& gpu_col_id = std::get<0>(tup); + auto& gpu_row_offsets = std::get<1>(tup); + + auto const num_nodes = gpu_col_id.size(); + rmm::device_uvector sorted_col_ids(gpu_col_id.size(), stream); // make a copy + thrust::copy( + rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(gpu_col_id.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + cudf::size_type const row_array_parent_col_id = [&]() { + cudf::size_type value = cuio_json::parent_node_sentinel; + auto const list_node_index = options.is_enabled_lines() ? 
0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + gpu_col_id.data() + list_node_index, + sizeof(cudf::size_type), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return value; + }(); + + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto [d_column_tree_csr, d_column_tree_properties] = + cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto iseq = check_equality( + d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); + // assert equality between csr and meta formats + ASSERT_TRUE(iseq); +} + +struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; + +TEST_F(JsonColumnTreeTests, JSONL_Small) +{ + std::string const input = + R"( {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; // Prepare input & output buffers + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_Large) +{ + std::string const input = + R"( {} + {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_ListofStruct) +{ + std::string const input = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_MissingEntries) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_MoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_StillMoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": ["123","456"], "bar": 123 } + { "foo2": { "b": 5 }, "car": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSON_MissingEntries) +{ + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_StructOfStructs) +{ + std::string json_string = + R"([ + {}, + { "a": { "y" : 6, "z": [] }}, + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + ])"; // Prepare input & output buffers + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_ArrayOfArrays_NestedList) +{ + std::string json_string = + R"([123, [1,2,3]] + [456, null, { "a": 1 }])"; + run_test(json_string); +} 
+ +TEST_F(JsonColumnTreeTests, JSON_ArrayofArrays_NestedList) +{ + std::string json_string = R"([[[1,2,3], null, 123], + [null, { "a": 1 }, 456 ]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_Empty) +{ + std::string json_string = R"([])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_List) +{ + std::string json_string = R"([123])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedList) +{ + std::string json_string = R"([[[]]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedLists) +{ + std::string json_string = R"([[], [], []])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_ListofLists) +{ + std::string json_string = R"([[1, 2, 3], [4, 5, null], []])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_EmptyListOfLists) +{ + std::string json_string = R"([[]])"; + run_test(json_string, true); +} diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 08a0136700d..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,21 +778,6 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } -TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 
6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {2, 7, 8}); -} - TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -915,18 +900,3 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } - -TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {0, 1, 3, 4, 5, 6, 9}); -} diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index 325f2dbcba7..911c443d294 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -43,15 +43,28 @@ files: includes: - cuda_version - test_cpp - test_python: + test_python_cudf_pandas: output: none includes: - cuda_version - py_version - test_python_common - test_python_cudf - - test_python_dask_cudf - test_python_cudf_pandas + test_python_cudf: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_cudf + test_python_other: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_dask_cudf test_java: output: none includes: @@ -350,12 +363,12 @@ dependencies: common: - output_types: conda packages: - - fmt>=10.1.1,<11 + - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.12.0,<1.13 + - spdlog>=1.14.1,<1.15 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -650,7 +663,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.8,<1.9 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -707,9 +720,7 @@ dependencies: - matrix: {dependencies: "oldest"} packages: - numba==0.57.* - - numpy==1.23.* - pandas==2.0.* - - pyarrow==14.0.0 - matrix: packages: - output_types: conda @@ -764,6 +775,14 @@ dependencies: - &transformers transformers==4.39.3 - tzdata specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: 
+              - numpy==1.23.*
+              - pyarrow==14.0.0
+          - matrix:
+            packages:
       - output_types: conda
         matrices:
           - matrix:
@@ -783,6 +802,15 @@ dependencies:
       packages:
         - dask-cuda==24.12.*,>=0.0.0a0
         - *numba
+    specific:
+      - output_types: [conda, requirements]
+        matrices:
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - numpy==1.24.*
+              - pyarrow==14.0.1
+          - matrix:
+            packages:
   depends_on_libcudf:
     common:
       - output_types: conda
diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png
new file mode 100644
index 00000000000..e472cf66612
Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ
diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png
new file mode 100644
index 00000000000..6854ed5a436
Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ
diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png
new file mode 100644
index 00000000000..d0b48ab2901
Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
new file mode 100644
index 00000000000..0a3a0d86b2c
--- /dev/null
+++ b/docs/cudf/source/cudf_polars/index.rst
@@ -0,0 +1,41 @@
+Polars GPU engine
+=================
+
+cuDF provides an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API.
+The engine supports most of the core expressions and data types, as well as a growing set of more advanced
+dataframe manipulations and data file formats. When using the GPU engine, Polars converts expressions into an
+optimized query plan and determines whether the plan is supported on the GPU. If it is not, execution
+transparently falls back to the standard Polars engine and runs on the CPU.
+
+Benchmark
+---------
+We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
+
+.. figure:: ../_static/pds_benchmark_polars.png
+   :width: 600px
+
+You can see up to a 13x speedup using the GPU engine on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top-performing queries:
+
+.. figure:: ../_static/compute_heavy_queries_polars.png
+   :width: 1000px
+
+:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe`
+
+You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__.
+
+Learn More
+----------
+
+The GPU engine for Polars is now available in Open Beta and is under rapid development. To learn more, visit the `GPU Support page `__ on the Polars website.
+
+Launch on Google Colab
+----------------------
+
+.. figure:: ../_static/colab.png
+   :width: 200px
+   :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb
+
+   Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__.
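+
+Example
+-------
+
+As a minimal sketch (the data and column names here are hypothetical), a lazy
+query can be executed on the GPU simply by requesting the GPU engine at
+``collect`` time::
+
+    import polars as pl
+
+    q = (
+        pl.LazyFrame({"key": [1, 1, 2], "value": [10.0, 20.0, 30.0]})
+        .group_by("key")
+        .agg(pl.col("value").sum())
+    )
+
+    # Executes on the GPU when the plan is supported; otherwise Polars
+    # transparently falls back to the default CPU engine.
+    result = q.collect(engine="gpu")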
diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst
index 3b8dfa5fe01..1b86cafeb48 100644
--- a/docs/cudf/source/index.rst
+++ b/docs/cudf/source/index.rst
@@ -29,5 +29,6 @@ other operations.
 
    user_guide/index
    cudf_pandas/index
+   cudf_polars/index
    libcudf_docs/index
    developer_guide/index
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
new file mode 100644
index 00000000000..06f74a38709
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
@@ -0,0 +1,6 @@
+=======
+extract
+=======
+
+.. automodule:: pylibcudf.strings.extract
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
new file mode 100644
index 00000000000..9850ee10098
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
@@ -0,0 +1,6 @@
+=======
+findall
+=======
+
+.. automodule:: pylibcudf.strings.findall
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 1200ecba5d9..9b1a6b72a88 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -7,9 +7,12 @@ strings
    capitalize
    char_types
    contains
+   extract
    find
+   findall
    regex_flags
    regex_program
    repeat
    replace
    slice
+   strip
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst
new file mode 100644
index 00000000000..a79774b8e67
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst
@@ -0,0 +1,6 @@
+=====
+strip
+=====
+
+.. automodule:: pylibcudf.strings.strip
+    :members:
diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
new file mode 100644
index 00000000000..6cd098da56d
--- /dev/null
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -0,0 +1,320 @@
+.. _best-practices:
+
+Dask cuDF Best Practices
+========================
+
+This page outlines several important guidelines for using `Dask cuDF
+`__ effectively.
+
+.. note::
+    Since Dask cuDF is a backend extension for
+    `Dask DataFrame `__,
+    the guidelines discussed in the `Dask DataFrames Best Practices
+    `__
+    documentation also apply to Dask cuDF (excluding any pandas-specific
+    details).
+
+
+Deployment and Configuration
+----------------------------
+
+Use Dask-CUDA
+~~~~~~~~~~~~~
+
+To execute a Dask workflow on multiple GPUs, a Dask cluster must
+be deployed with `Dask-CUDA `__
+and `Dask.distributed `__.
+
+When running on a single machine, the `LocalCUDACluster `__
+convenience function is strongly recommended. No matter how many GPUs are
+available on the machine (even one!), using `Dask-CUDA has many advantages
+`__
+over default (threaded) execution. Just to list a few:
+
+* Dask-CUDA makes it easy to pin workers to specific devices.
+* Dask-CUDA makes it easy to configure memory-spilling options.
+* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time.
+
+Please see `Dask-CUDA's API `__
+and `Best Practices `__
+documentation for detailed information. Typical ``LocalCUDACluster`` usage
+is also illustrated within the multi-GPU section of `Dask cuDF's
+`__ documentation.
+
+.. note::
+.. note::
+    When running on cloud infrastructure or HPC systems, it is usually best to
+    leverage system-specific deployment libraries like `Dask Operator
+    `__ and `Dask-Jobqueue
+    `__.
+
+    Please see `the RAPIDS deployment documentation `__
+    for further details and examples.
+
+
+Use diagnostic tools
+~~~~~~~~~~~~~~~~~~~~
+
+The Dask ecosystem includes several diagnostic tools that are well worth
+using. These include an intuitive `browser dashboard
+`__ as well as a dedicated
+`API for collecting performance profiles
+`__.
+
+No matter the workflow, using the dashboard is strongly recommended.
+It provides a visual representation of worker resources and compute
+progress. It also shows basic GPU memory and utilization metrics (under
+the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab,
+use `NVDashboard `__.
+
+
+Enable cuDF spilling
+~~~~~~~~~~~~~~~~~~~~
+
+When using Dask cuDF for classic ETL workloads, it is usually best
+to enable `native spilling support in cuDF
+`__.
+When using :func:`LocalCUDACluster`, this is easily accomplished by
+setting ``enable_cudf_spill=True``.
+
+When a Dask cuDF workflow includes conversion between DataFrame and Array
+representations, native cuDF spilling may be insufficient. For these cases,
+`JIT-unspill `__
+is likely to provide better protection from out-of-memory (OOM) errors.
+Please see `Dask-CUDA's spilling documentation
+`__ for further details
+and guidance.
+
+Use RMM
+~~~~~~~
+
+Memory allocations in cuDF are significantly faster and more efficient when
+the `RAPIDS Memory Manager (RMM) `__
+library is configured appropriately on worker processes. In most cases, the
+best way to manage memory is by initializing an RMM pool on each worker
+before executing a workflow. When using :func:`LocalCUDACluster`, this is
+easily accomplished by setting ``rmm_pool_size`` to a large fraction
+(e.g. ``0.9``).
+
+See the `Dask-CUDA memory-management documentation
+`__
+for more details.
+
+Use the Dask DataFrame API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Although Dask cuDF provides a public ``dask_cudf`` Python module, we
+strongly recommend that you use the CPU/GPU-portable ``dask.dataframe``
+API instead. Simply `use the Dask configuration system
+`__
+to set the ``"dataframe.backend"`` option to ``"cudf"``, and the
+``dask_cudf`` module will be imported and used implicitly.
+
+Be sure to use the :func:`to_backend` method if you need to convert
+between the different DataFrame backends. For example::
+
+    df = df.to_backend("pandas")  # This gives us a pandas-backed collection
+
+.. note::
+    Although :func:`to_backend` makes it easy to move data between pandas
+    and cuDF, repetitive CPU-GPU data movement can degrade performance
+    significantly. For optimal results, keep your data on the GPU as much
+    as possible.
+
+Avoid eager execution
+~~~~~~~~~~~~~~~~~~~~~
+
+Although Dask DataFrame collections are lazy by default, there are several
+notable methods that will result in the immediate execution of the
+underlying task graph:
+
+:func:`compute`: Calling ``ddf.compute()`` will materialize the result of
+``ddf`` and return a single cuDF object. This is done by executing the entire
+task graph associated with ``ddf`` and concatenating its partitions in
+local memory on the client process.
+
+.. note::
+    Never call :func:`compute` on a large collection that cannot fit
+    comfortably in the memory of a single GPU!
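+
+For instance, a small reduction is usually safe to materialize, while a full
+collection is not (a sketch; ``ddf`` stands in for an existing Dask cuDF
+collection with a hypothetical ``"amount"`` column)::
+
+    total = ddf["amount"].sum().compute()  # Small scalar result: safe
+
+    # df = ddf.compute()  # Materializes every partition on the client: avoid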
+
+:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will
+execute the entire task graph associated with ``ddf``. The most important
+difference is that the computed partitions will remain in distributed
+worker memory instead of being concatenated together on the client process.
+Another difference is that :func:`persist` will return immediately when
+executing on a distributed cluster. If you need a blocking synchronization
+point in your workflow, simply use the :func:`wait` function::
+
+    ddf = ddf.persist()
+    wait(ddf)
+
+.. note::
+    Avoid calling :func:`persist` on a large collection that cannot fit
+    comfortably in global worker memory. If the total sum of the partition
+    sizes is larger than the sum of all GPU memory, calling persist will
+    result in significant spilling from device memory. If the individual
+    partition sizes are large, this is likely to produce an OOM error.
+
+:func:`len` / :func:`head` / :func:`tail`: Although these operations are used
+often within pandas/cuDF code to quickly inspect data, it is best to avoid
+them in Dask DataFrame. In most cases, these operations will execute some or
+all of the underlying task graph to materialize the collection.
+
+:func:`sort_values` / :func:`set_index`: These operations both require Dask
+to eagerly collect quantile information about the column(s) being targeted
+by the global sort operation. See `Avoid Sorting`__ below for further
+considerations.
+
+.. note::
+    When using :func:`set_index`, be sure to pass in ``sort=False`` whenever
+    the global collection does not **need** to be sorted by the new index.
+
+Avoid Sorting
+~~~~~~~~~~~~~
+
+`The design of Dask DataFrame `__
+makes it advantageous to work with data that is already sorted along its
+index at creation time. For most other cases, it is best to avoid sorting
+unless the logic of the workflow makes global ordering absolutely necessary.
+
+If the purpose of a :func:`sort_values` operation is to ensure that all
+unique values in ``by`` will be moved to the same output partition, then
+`shuffle
+`__
+is often the better option.
+
+
+Reading Data
+------------
+
+Tune the partition size
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The ideal partition size is usually between 1/32 and 1/8 of the memory
+capacity of a single GPU. Increasing the partition size will typically
+reduce the number of tasks in your workflow and improve the GPU utilization
+for each task. However, if the partitions are too large, the risk of OOM
+errors can become significant.
+
+.. note::
+    As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive
+    workflows (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise.
+    For pathologically skewed data distributions, it may be necessary to
+    target 1/64 or smaller. This rule of thumb comes from anecdotal
+    optimization and OOM-debugging experience. Since every workflow is
+    different, choosing the best partition size is both an art and a science.
+
+The easiest way to tune the partition size is when the DataFrame collection
+is first created by a function like :func:`read_parquet`, :func:`read_csv`,
+or :func:`from_map`. For example, both :func:`read_parquet` and
+:func:`read_csv` expose a ``blocksize`` argument for adjusting the maximum
+partition size, as sketched below.
+
+If the partition size cannot be tuned effectively at creation time, the
+`repartition `__
+method can be used as a last resort.
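+
+A sketch of tuning the partition size at creation time (the path and the
+``"1 GiB"`` value are placeholders; choose a blocksize based on your GPU's
+memory capacity)::
+
+    import dask
+    import dask.dataframe as dd
+
+    dask.config.set({"dataframe.backend": "cudf"})
+    ddf = dd.read_parquet("/path/to/dataset/", blocksize="1 GiB")
+
+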
+Use Parquet
+~~~~~~~~~~~
+
+`Parquet `__ is the recommended
+file format for Dask cuDF. It provides efficient columnar storage and enables
+Dask to perform valuable query optimizations like column projection and
+predicate pushdown.
+
+The most important arguments to :func:`read_parquet` are ``blocksize`` and
+``aggregate_files``:
+
+``blocksize``: Use this argument to specify the maximum partition size.
+The default is ``"256 MiB"``, but larger values are usually more performant
+on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize``
+value to map a discrete number of Parquet row-groups (or files) to each
+output partition. This mapping will only account for the uncompressed
+storage size of each row group, which is usually smaller than the
+corresponding ``cudf.DataFrame``.
+
+``aggregate_files``: Use this argument to specify whether Dask should
+map multiple files to the same DataFrame partition. The default is
+``False``, but ``aggregate_files=True`` is usually more performant when
+the dataset contains many files that are smaller than half of ``blocksize``.
+
+If you know that your files correspond to a reasonable partition size
+before splitting or aggregation, set ``blocksize=None`` to disallow
+file splitting. In the absence of column-projection pushdown, this will
+result in a simple 1-to-1 mapping between files and output partitions.
+
+.. note::
+    If your workflow requires a strict 1-to-1 mapping between files and
+    partitions, use :func:`from_map` to manually construct your partitions
+    with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used,
+    query-planning optimizations may automatically aggregate distinct files
+    into the same partition (even when ``aggregate_files=False``).
+
+.. note::
+    Metadata collection can be extremely slow when reading from remote
+    storage (e.g. S3 and GCS). When reading many remote files that all
+    correspond to a reasonable partition size, use ``blocksize=None``
+    to avoid unnecessary metadata collection.
+
+
+Use :func:`from_map`
+~~~~~~~~~~~~~~~~~~~~
+
+To implement custom DataFrame-creation logic that is not covered by
+existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map`
+whenever possible. The :func:`from_map` API has several advantages
+over :func:`from_delayed`:
+
+* It allows proper lazy execution of your custom logic
+* It enables column projection (as long as the mapped function supports a ``columns`` keyword argument)
+
+See the `from_map API documentation `__
+for more details, and the sketch below for typical usage.
+
+.. note::
+    Whenever possible, be sure to specify the ``meta`` argument to
+    :func:`from_map`. If this argument is excluded, Dask will need to
+    materialize the first partition eagerly. If a large RMM pool is in
+    use on the first visible device, this eager execution on the client
+    may lead to an OOM error.
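+
+A sketch of :func:`from_map` usage (the file list, the ``read_partition``
+helper, and the column names are hypothetical)::
+
+    import cudf
+    import dask.dataframe as dd
+
+    files = ["part.0.parquet", "part.1.parquet"]
+
+    def read_partition(path, columns=None):
+        # Accepting a `columns` argument enables column projection.
+        return cudf.read_parquet(path, columns=columns)
+
+    # An empty cudf.DataFrame describing the expected output schema.
+    meta = cudf.DataFrame({"a": [0], "b": [0.0]}).head(0)
+    ddf = dd.from_map(read_partition, files, meta=meta)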
+
+
+Sorting, Joining, and Grouping
+------------------------------
+
+Sorting, joining, and grouping operations all have the potential to
+require the global shuffling of data between distinct partitions.
+When the initial data fits comfortably in global GPU memory, these
+"all-to-all" operations are typically bound by worker-to-worker
+communication. When the data is larger than global GPU memory, the
+bottleneck is typically device-to-host memory spilling.
+
+Although every workflow is different, the following guidelines
+are often recommended:
+
+* `Use a distributed cluster with Dask-CUDA workers `_
+* `Use native cuDF spilling whenever possible `_
+* Avoid shuffling whenever possible
+
+  * Use ``split_out=1`` for low-cardinality groupby aggregations
+  * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+
+* `Use UCX `__ if communication is a bottleneck.
+
+.. note::
+    UCX enables Dask-CUDA workers to communicate using high-performance
+    transport technologies like `NVLink `__
+    and InfiniBand. Without UCX, inter-process communication will rely
+    on TCP sockets.
+
+
+User-defined functions
+----------------------
+
+Most real-world Dask DataFrame workflows use `map_partitions
+`__
+to map user-defined functions across every partition of the underlying data.
+This API is an intuitive and scalable way to apply custom operations.
+With that said, the :func:`map_partitions` method will produce an opaque
+DataFrame expression that blocks the query-planning `optimizer
+`__ from performing
+useful optimizations (like projection and filter pushdown).
+
+Since column-projection pushdown is often the most effective optimization,
+it is important to select the necessary columns both before and after calling
+:func:`map_partitions`. You can also add explicit filter operations to further
+mitigate the loss of filter pushdown.
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 7fe6cbd45fa..23ca7e49753 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -15,7 +15,7 @@ as the ``"cudf"`` dataframe backend for
 .. note::
   Neither Dask cuDF nor Dask DataFrame provides support for multi-GPU
   or multi-node execution on their own. You must also deploy a
-  `dask.distributed ` cluster
+  `dask.distributed `__ cluster
   to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
   `__ to simplify the setup of the cluster,
   taking advantage of all features of the GPU
@@ -29,6 +29,10 @@ minutes to Dask by
 `10 minutes to cuDF and Dask cuDF
 `__.
 
+After reviewing the sections below, please see the
+:ref:`Best Practices ` page for further guidance on
+using Dask cuDF effectively.
+
 Using Dask cuDF
 ---------------
@@ -36,7 +40,7 @@ Using Dask cuDF
 The Dask DataFrame API (Recommended)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Simply use the `Dask configuration ` system to
+Simply use the `Dask configuration `__ system to
 set the ``"dataframe.backend"`` option to ``"cudf"``.
 From Python, this can be achieved like so::
@@ -50,14 +54,14 @@ environment before running your code.
 
 Once this is done, the public Dask DataFrame API will leverage
 ``cudf`` automatically when a new DataFrame collection is created
 from an on-disk format using any of the following ``dask.dataframe``
-functions::
+functions:
 
-* :func:`dask.dataframe.read_parquet`
-* :func:`dask.dataframe.read_json`
-* :func:`dask.dataframe.read_csv`
-* :func:`dask.dataframe.read_orc`
-* :func:`dask.dataframe.read_hdf`
-* :func:`dask.dataframe.from_dict`
+* :func:`read_parquet`
+* :func:`read_json`
+* :func:`read_csv`
+* :func:`read_orc`
+* :func:`read_hdf`
+* :func:`from_dict`
 
 For example::
@@ -112,8 +116,8 @@ performance benefit over the CPU/GPU-portable ``dask.dataframe`` API.
 
 Also, using some parts of the explicit API is incompatible with
 automatic query planning (see the next section).
-The explicit Dask cuDF API
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+Query Planning
+~~~~~~~~~~~~~~
 
 Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+).
 As long as the ``"dataframe.query-planning"`` configuration is set to
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index c8308ca17ec..2bb74c3e3b1 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -38,6 +38,8 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean allowLeadingZeros;
   private final boolean allowNonNumericNumbers;
   private final boolean allowUnquotedControlChars;
+  private final boolean cudfPruneSchema;
+  private final byte lineDelimiter;
 
   private JSONOptions(Builder builder) {
     super(builder);
@@ -52,6 +54,16 @@ private JSONOptions(Builder builder) {
     allowLeadingZeros = builder.allowLeadingZeros;
     allowNonNumericNumbers = builder.allowNonNumericNumbers;
     allowUnquotedControlChars = builder.allowUnquotedControlChars;
+    cudfPruneSchema = builder.cudfPruneSchema;
+    lineDelimiter = builder.lineDelimiter;
+  }
+
+  public boolean shouldCudfPruneSchema() {
+    return cudfPruneSchema;
+  }
+
+  public byte getLineDelimiter() {
+    return lineDelimiter;
   }
 
   public boolean isDayFirst() {
@@ -123,6 +135,22 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptions.Builder> {
+    private boolean cudfPruneSchema = false;
+    private byte lineDelimiter = '\n';
+
+    public Builder withCudfPruneSchema(boolean prune) {
+      cudfPruneSchema = prune;
+      return this;
+    }
+
+    public Builder withLineDelimiter(char delimiter) {
+      if (delimiter > Byte.MAX_VALUE) {
+        throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters: " + delimiter);
+      }
+      lineDelimiter = (byte)delimiter;
+      return this;
+    }
+
     /**
      * Should json validation be strict or not
      */
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 09da43374ae..6d370ca27b2 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -258,7 +258,9 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                       boolean strictValidation,
                                       boolean allowLeadingZeros,
                                       boolean allowNonNumericNumbers,
-                                      boolean allowUnquotedControl) throws CudfException;
+                                      boolean allowUnquotedControl,
+                                      boolean pruneColumns,
+                                      byte lineDelimiter) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                                     int[] dTypeIds, int[] dTypeScales,
@@ -272,6 +274,8 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                                     boolean allowLeadingZeros,
                                                     boolean allowNonNumericNumbers,
                                                     boolean allowUnquotedControl,
+                                                    boolean pruneColumns,
+                                                    byte lineDelimiter,
                                                     long dsHandle) throws CudfException;
 
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
@@ -284,6 +288,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
                                                             boolean allowLeadingZeros,
                                                             boolean allowNonNumericNumbers,
                                                             boolean allowUnquotedControl,
+                                                            byte lineDelimiter,
                                                             long dsHandle) throws CudfException;
 
   private static native long readAndInferJSON(long address, long length,
@@ -297,7 +302,8 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean strictValidation,
                                               boolean allowLeadingZeros,
                                               boolean allowNonNumericNumbers,
-                                              boolean allowUnquotedControl) throws CudfException;
+                                              boolean allowUnquotedControl,
+                                              byte lineDelimiter) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1308,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount)
    * @return the file parsed as a table on the GPU.
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1321,7 +1331,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1404,7 +1416,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars())); + opts.unquotedControlChars(), + opts.getLineDelimiter())); } /** @@ -1426,6 +1439,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle)); return twm; } finally { @@ -1465,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1479,7 +1497,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1505,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1518,6 +1542,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 92e213bcb60..0f77da54152 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* 
env,
                                                            jboolean allow_leading_zeros,
                                                            jboolean allow_nonnumeric_numbers,
                                                            jboolean allow_unquoted_control,
+                                                           jbyte line_delimiter,
                                                            jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1646,8 +1647,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(false);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1676,7 +1679,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
                                            jboolean strict_validation,
                                            jboolean allow_leading_zeros,
                                            jboolean allow_nonnumeric_numbers,
-                                           jboolean allow_unquoted_control)
+                                           jboolean allow_unquoted_control,
+                                           jbyte line_delimiter)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1700,6 +1704,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .strict_validation(strict_validation)
         .mixed_types_as_string(mixed_types_as_string)
+        .prune_columns(false)
+        .delimiter(static_cast<char>(line_delimiter))
         .keep_quotes(keep_quotes);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
@@ -1814,6 +1820,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean allow_leading_zeros,
                                                  jboolean allow_nonnumeric_numbers,
                                                  jboolean allow_unquoted_control,
+                                                 jboolean prune_columns,
+                                                 jbyte line_delimiter,
                                                  jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1848,8 +1856,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(prune_columns);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
@@ -1908,7 +1918,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean strict_validation,
                                                            jboolean allow_leading_zeros,
                                                            jboolean allow_nonnumeric_numbers,
-                                                           jboolean allow_unquoted_control)
+                                                           jboolean allow_unquoted_control,
+                                                           jboolean prune_columns,
+                                                           jbyte line_delimiter)
 {
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1957,8 +1969,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .delimiter(static_cast<char>(line_delimiter))
         .strict_validation(strict_validation)
-        .keep_quotes(keep_quotes);
+        .keep_quotes(keep_quotes)
+        .prune_columns(prune_columns);
     if (strict_validation) {
       opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 830f2b33b32..c7fcb1756b6 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -40,7 +40,6 @@
 import
org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() { } } + private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" + + "{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadJSONDelim() { + Schema schema = Schema.builder().addColumn(DType.STRING, "a").build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withLineDelimiter('\0') + .build(); + try (Table expected = new Table.TestBuilder() + .column("12\n3", "AB\nC") + .build(); + Table found = Table.readJSON(schema, opts, CR_JSON_TEST_BUFFER)) { + assertTablesAreEqual(expected, found); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. 
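+            Exposed buffers are assumed to be reachable by third-party
+            code, so optimizations such as spilling and copy-on-write are
+            expected to leave them in place.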
Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == 
"second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index e6c9d60b05b..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object _process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view 
input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ def timestamp2int(Column input_col, dtype, format): A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ def int2timedelta(Column input_col, str format): """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 3cf2084e30a..0e758d5b322 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,21 +1,13 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def findall(Column source_strings, object pattern, uint32_t flags): @@ -23,18 +15,11 @@ def findall(Column source_strings, object pattern, uint32_t flags): Returns data with all non-overlapping matches of `pattern` in each string of `source_strings` as a lists column. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_findall( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.findall( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef 
pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = ( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ def nans_to_nulls(Column input): @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point. 
- owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e059917b0b8..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def 
nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not (0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d73ad8225ca..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - 
for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for 
name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. 
interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." 
) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..37ad6b8fabb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not 
isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -764,11 +767,15 @@ def fillna( ) @_performance_tracking - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] + def _drop_column( + self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" + ) -> None: + """Drop a column by *name* inplace.""" + try: + del self._data[name] + except KeyError as err: + if errors != "ignore": + raise KeyError(f"column '{name}' does not exist") from err @_performance_tracking def _quantile_table( @@ -988,7 +995,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1022,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. """ - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1434,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1444,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1590,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1852,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. 
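For orientation, the `frame.py` hunks above define the accessors that the call-site changes throughout this diff migrate to. A rough sketch of the intended equivalences, assuming `self._data` behaves like a label-to-column mapping (as `ColumnAccessor` does); this is an illustration, not the real implementation:

    class FrameSketch:
        """Simplified stand-in for cudf.core.frame.Frame."""

        def __init__(self, data: dict):
            self._data = data  # label -> column-like object

        @property
        def _column_names(self) -> tuple:      # was: self._data.names
            return tuple(self._data)

        @property
        def _columns(self) -> tuple:           # was: self._data.columns
            return tuple(self._data.values())

        @property
        def _column_labels_and_values(self):   # was: self._data.items()
            return zip(self._column_names, self._columns)

        @property
        def _dtypes(self):                     # lazy (label, dtype) pairs
            for label, col in self._column_labels_and_values:
                yield label, col.dtype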
- left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. - for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..5952815deef 100644 --- 
a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import numbers import operator import textwrap import warnings @@ -150,24 +149,14 @@ ) -def _get_host_unique(array): +def _get_unique_drop_labels(array): + """Return labels to be dropped for IndexedFrame.drop.""" if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] + yield from np.unique(as_column(array).values_host) + elif is_scalar(array): + yield array else: - return set(array) - - -def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): - for c in columns: - try: - f._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e + yield from set(array) def _indices_from_labels(obj, labels): @@ -294,7 +283,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? - return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +296,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +872,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2693,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3003,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3200,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3217,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3753,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3773,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3784,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3938,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name
in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4260,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4295,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4843,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4923,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -5261,15 +5251,14 @@ def drop( out = self.copy() if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, target, errors) + for label in _get_unique_drop_labels(target): + out._drop_column(label, errors=errors) elif axis in (0, "index"): dropped = _drop_rows_by_labels(out, target, level, errors) if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) + for label in _get_unique_drop_labels(columns): + dropped._drop_column(label, errors=errors) out._mimic_inplace(dropped, inplace=True) @@ -6224,7 +6213,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6295,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. 
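Returning to the `drop` hunk above: `_get_unique_drop_labels` yields each label at most once and treats scalars as single labels, so `drop` can simply loop and let `_drop_column(errors=...)` decide whether a missing label raises. A hedged behavioural sketch (the assertion reflects the expected result, not a recorded test run):

    import cudf

    gdf = cudf.DataFrame({"a": [1], "b": [2], "c": [3]})
    # Duplicate labels are dropped only once; Series/Index inputs are
    # also accepted (see test_drop_cudf_obj_columns below).
    out = gdf.drop(columns=["b", "b"])
    assert list(out.columns) == ["a", "c"]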
- common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b86ad38c944..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... - self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. 
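To make the suffixing rule in `_merge_results` concrete: key columns sharing a name across both tables are emitted once, while colliding non-key columns get the left/right suffixes. With pandas-style default suffixes:

    import cudf

    left = cudf.DataFrame({"key": [1, 2], "val": [10, 20]})
    right = cudf.DataFrame({"key": [1, 2], "val": [30, 40]})
    merged = left.merge(right, on="key")
    # "key" is a same-named key column, so it appears once; "val"
    # collides without being a key, so both sides are suffixed.
    assert sorted(merged.columns) == ["key", "val_x", "val_y"]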
if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key): ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1475,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1916,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2113,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2154,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c951db00c9a..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -410,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in 
o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -438,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -447,7 +447,7 @@ def concat( "the labels to the same type." ) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -843,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -943,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1009,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1087,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1560,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 
+210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." 
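The `csv.py` check above (like the ORC and dlpack hunks below) now keys off dtypes instead of column classes, so unsupported types can be detected from the `_dtypes` generator without touching column data. A sketch of the pattern, using the private `_dtypes` accessor introduced in this diff:

    import cudf

    df = cudf.DataFrame({"a": [[1, 2]], "b": [1]})
    unsupported = [
        name
        for name, dtype in df._dtypes
        if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype))
    ]
    # "a" is a list column, which to_csv rejects with NotImplementedError.
    assert unsupported == ["a"]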
@@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index bf2ee6ae624..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,12 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +class ProxyFallbackError(Exception): + """Raised when fallback occurs""" + + pass + + def _fast_function_call(): """ Placeholder fast function for pytest profiling purposes. @@ -957,6 +963,10 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + raise ProxyFallbackError( + f"The operation failed with cuDF, the reason was {type(err)}: {err}." 
+ ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..d05ba9aaacc 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,6 @@ filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning + # Ignore numba PEP 456 warning specific to arm machines + ignore:FNV hashing is not implemented in Numba.*:UserWarning +addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 773141ee71a..979c936a182 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -33,9 +33,10 @@ def __array_function__(self, *args, **kwargs): missing_arrfunc_reason = "NEP-18 support is not available in NumPy" +np.random.seed(0) + @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -47,7 +48,8 @@ def __array_function__(self, *args, **kwargs): lambda x: np.linalg.norm(x), ], ) -def test_array_func_cudf_series(np_ar, func): +def test_array_func_cudf_series(func): + np_ar = np.random.random(100) cudf_ser = cudf.Series(np_ar) expect = func(np_ar) got = func(cudf_ser) @@ -58,9 +60,6 @@ def test_array_func_cudf_series(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -74,7 +73,8 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.prod(x, axis=1), ], ) -def test_array_func_cudf_dataframe(pd_df, func): +def test_array_func_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) expect = func(pd_df) got = func(cudf_df) @@ -82,9 +82,6 @@ def test_array_func_cudf_dataframe(pd_df, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -93,21 +90,22 @@ def test_array_func_cudf_dataframe(pd_df, func): lambda x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_dataframe(pd_df, func): +def test_array_func_missing_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) with pytest.raises(TypeError): func(cudf_df) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ lambda x: np.unique(x), ], ) -def test_array_func_cudf_index(np_ar, func): +def test_array_func_cudf_index(func): + np_ar = np.random.random(100) 
cudf_index = cudf.Index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) @@ -118,7 +116,6 @@ def test_array_func_cudf_index(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -127,7 +124,8 @@ def test_array_func_cudf_index(np_ar, func): lambda x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_index(np_ar, func): +def test_array_func_missing_cudf_index(func): + np_ar = np.random.random(100) cudf_index = cudf.Index(cudf.Series(np_ar)) with pytest.raises(TypeError): func(cudf_index) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 9d69e626c3d..5acdf36de80 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -236,6 +236,7 @@ def test_avro_compression(rows, codec): }, ], rows, + seed=0, ) expected_df = cudf.DataFrame.from_arrow(df) @@ -599,7 +600,7 @@ def test_avro_reader_multiblock( else: assert dtype in ("float32", "float64") avro_type = "float" if dtype == "float32" else "double" - + np.random.seed(0) # We don't use rand_dataframe() here, because it increases the # execution time of each test by a factor of 10 or more (it appears # to use a very costly approach to generating random data). diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f4d1578bda7..6f88d942746 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace): assert_eq(expected, actual) +@pytest.mark.parametrize("obj", ["Index", "Series"]) +def test_drop_cudf_obj_columns(obj): + pdf = pd.DataFrame({"A": [1], "B": [1]}) + gdf = cudf.from_pandas(pdf) + + columns = ["B"] + expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) + actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "pdf", [ diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0aaa71e50d7..848bc259e7b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2470,6 +2470,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() pdf.columns = ["x", "y", "z"] @@ -2602,6 +2603,7 @@ def test_groupby_shift_row_mixed_numerics( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2639,6 +2641,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2687,6 +2690,7 @@ def test_groupby_shift_row_mixed_fill( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2732,6 +2736,7 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): ], rows=nelem, use_threads=False, + seed=0, ) gdf = cudf.from_pandas(t.to_pandas()) @@ -2782,6 +2787,7 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2815,6 +2821,7 @@ def test_groupby_diff_row_zero_shift(nelem): ], rows=nelem, use_threads=False, + seed=0, ) gdf = 
cudf.from_pandas(t.to_pandas()) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c4ab4b0a853..2bbed40e34e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -26,7 +26,11 @@ from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.pandas.fast_slow_proxy import ( + ProxyFallbackError, + _Unusable, + is_proxy_object, +) from cudf.testing import assert_eq if not LOADED: @@ -1738,3 +1742,13 @@ def add_one_ufunc(a): return a + 1 assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) + + +@pytest.mark.xfail( + reason="Fallback expected because casting to object is not supported", +) +def test_fallback_raises_error(monkeypatch): + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + with pytest.raises(ProxyFallbackError): + pd.Series(range(2)).astype(object) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py new file mode 100644 index 00000000000..896256bf6d7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cudf.pandas import LOADED + +if not LOADED: + raise ImportError("These tests must be run with cudf.pandas loaded") + +import numpy as np +import pandas as pd + + +@pytest.fixture(autouse=True) +def fail_on_fallback(monkeypatch): + monkeypatch.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + + +@pytest.fixture +def dataframe(): + df = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 3, 4, 5], + "c": [1.2, 1.3, 1.5, 1.7, 1.11], + } + ) + return df + + +@pytest.fixture +def series(dataframe): + return dataframe["a"] + + +@pytest.fixture +def array(series): + return series.values + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "min", + "max", + "mean", + "std", + "var", + "prod", + "median", + ], +) +def test_no_fallback_in_reduction_ops(series, op): + s = series + getattr(s, op)() + + +def test_groupby(dataframe): + df = dataframe + df.groupby("a", sort=True).max() + + +def test_no_fallback_in_binops(dataframe): + df = dataframe + df + df + df - df + df * df + df**df + df[["a", "b"]] & df[["a", "b"]] + df <= df + + +def test_no_fallback_in_groupby_rolling_sum(dataframe): + df = dataframe + df.groupby("a").rolling(2).sum() + + +def test_no_fallback_in_concat(dataframe): + df = dataframe + pd.concat([df, df]) + + +def test_no_fallback_in_get_shape(dataframe): + df = dataframe + df.shape + + +def test_no_fallback_in_array_ufunc_op(array): + np.add(array, array) + + +def test_no_fallback_in_merge(dataframe): + df = dataframe + pd.merge(df * df, df + df, how="inner") + pd.merge(df * df, df + df, how="outer") + pd.merge(df * df, df + df, how="left") + pd.merge(df * df, df + df, how="right") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
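The no-fallback test module above depends on the `CUDF_PANDAS_FAIL_ON_FALLBACK` switch added in `fast_slow_proxy.py`: when it is set, an operation that would silently fall back to CPU pandas raises `ProxyFallbackError` instead. A minimal usage sketch, assuming the interpreter was started with `python -m cudf.pandas` so that `pandas` is proxied:

    import os

    os.environ["CUDF_PANDAS_FAIL_ON_FALLBACK"] = "True"

    import pandas as pd  # proxied by cudf.pandas

    # Casting to object currently falls back to CPU pandas, so this
    # raises ProxyFallbackError rather than silently running on host.
    pd.Series(range(2)).astype(object)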
+ +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..66c15f694ee 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -14,6 +14,12 @@ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +# Check we have a supported polars version +from cudf_polars.utils.versions import _ensure_polars_version + +_ensure_polars_version() +del _ensure_polars_version + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. " + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. If `None`, uses the current device. 
+ + Returns + ------- + Device active for the execution of the block. + + Notes + ----- + At exit, the device is restored to whatever was current at entry. + """ + previous: int = gpu.getDevice() + if device is not None: + gpu.setDevice(device) + try: + yield previous + finally: + gpu.setDevice(previous) + + def _callback( ir: IR, with_columns: list[str] | None, pyarrow_predicate: str | None, n_rows: int | None, + *, + device: int | None, + memory_resource: int | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + with ( + nvtx.annotate(message="ExecuteIR", domain="cudf_polars"), + # Device must be set before memory resource is obtained. + set_device(device), + set_memory_resource(memory_resource), + ): return ir.evaluate(cache={}).to_polars() def execute_with_cudf( nt: NodeTraverser, *, - raise_on_fail: bool = False, + config: GPUEngine, exception: type[Exception] | tuple[type[Exception], ...] = Exception, ) -> None: """ @@ -52,9 +162,8 @@ def execute_with_cudf( nt NodeTraverser - raise_on_fail - Should conversion raise an exception rather than continuing - without setting a callback. + config + GPUEngine configuration object exception Optional exception, or tuple of exceptions, to catch during @@ -62,9 +171,23 @@ def execute_with_cudf( The NodeTraverser is mutated if the libcudf executor can handle the plan. """ + device = config.device + memory_resource = config.memory_resource + raise_on_fail = config.config.get("raise_on_fail", False) + if unsupported := (config.config.keys() - {"raise_on_fail"}): + raise ValueError( + f"Engine configuration contains unsupported settings {unsupported}" + ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf(partial(_callback, translate_ir(nt))) + nt.set_udf( + partial( + _callback, + translate_ir(nt), + device=device, + memory_resource=memory_resource, + ) + ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index dd3b771e305..3fe3e5557cb 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self: is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + # TODO: Return Column once #16272 is fixed. + def astype(self, dtype: plc.DataType) -> plc.Column: + """ + Return the backing column as the requested dtype. + + Parameters + ---------- + dtype + Datatype to cast to. + + Returns + ------- + Column of requested type. + + Raises + ------ + RuntimeError + If the cast is unsupported. + + Notes + ----- + This only produces a copy if the requested dtype doesn't match + the current one. + """ + if self.obj.type() != dtype: + return plc.unary.cast(self.obj, dtype) + return self.obj + def copy_metadata(self, from_: pl.Series, /) -> Self: """ Copy metadata from a host series onto self. 
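The new `Column.astype` above is a thin wrapper over `plc.unary.cast` that returns the backing `pylibcudf` column unchanged (no copy) when the requested type already matches. A small sketch, assuming a CUDA device is available:

    import pyarrow as pa
    import pylibcudf as plc

    from cudf_polars.containers import Column

    col = Column(plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64())))
    same = col.astype(plc.DataType(plc.TypeId.INT64))  # matching dtype: no cast
    assert same is col.obj
    wider = col.astype(plc.DataType(plc.TypeId.FLOAT64))  # real cast, new column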
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. + name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process 
the result to insert the correct value. + h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
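Stepping back to the `Any`/`All` change at the top of this hunk: the truth tables encode polars' Kleene semantics for `ignore_nulls=False`, where the result of a null-excluding reduction sometimes has to be replaced by null. A minimal illustration of the semantics being matched, using only the public polars API (an illustrative aside, not part of the patch):

```python
import polars as pl

# Kleene OR: False | Null => Null, so `any` over {False, Null} is null...
assert pl.Series([False, None]).any(ignore_nulls=False) is None
# ...but a single True decides the result regardless of nulls.
assert pl.Series([True, None]).any(ignore_nulls=False) is True

# Kleene AND: False && anything => False, so `all` over {False, Null} is False...
assert pl.Series([False, None]).all(ignore_nulls=False) is False
# ...while True && Null stays unknown.
assert pl.Series([True, None]).all(ignore_nulls=False) is None
```

This is exactly the post-processing above: a `False` result of `any` (or a `True` result of `all`) over a column containing nulls cannot be trusted, and is replaced by an all-null column.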
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
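The `Strptime` evaluation just above validates every row with `is_timestamp` before converting: in strict mode any failure raises `InvalidOperationError`, otherwise failing rows are nulled out via `boolean_mask_scatter` before `to_timestamps`. A small sketch of the polars behaviour being reproduced (the series and format here are arbitrary):

```python
from datetime import date

import polars as pl

s = pl.Series(["2024-01-01", "not-a-date"])

# strict=False nulls out unparseable rows, which the GPU path mimics by
# scattering nulls over the non-timestamp positions before conversion.
assert s.str.strptime(pl.Date, "%Y-%m-%d", strict=False).to_list() == [
    date(2024, 1, 1),
    None,
]

# strict=True raises instead, matching the InvalidOperationError branch.
try:
    s.str.strptime(pl.Date, "%Y-%m-%d", strict=True)
except pl.exceptions.InvalidOperationError:
    pass
```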
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
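On the `Microsecond`/`Nanosecond` branches above: libcudf's `extract_datetime_component` returns each field in isolation (the microsecond field of `0.123456s` is just `456`), while polars reports cumulative sub-second values, hence the multiply-and-add recombination. A quick check of the polars semantics being targeted (the timestamp is chosen arbitrarily):

```python
from datetime import datetime

import polars as pl

ts = pl.Series([datetime(2024, 1, 1, 0, 0, 0, 123456)])

# polars reports cumulative sub-second components, so the GPU path must
# recombine libcudf's isolated fields: 123 * 1_000 + 456 == 123_456.
assert ts.dt.millisecond().item() == 123
assert ts.dt.microsecond().item() == 123_456
assert ts.dt.nanosecond().item() == 123_456_000
```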
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + 
) or (
+ self.name == "cum_prod"
+ and plc.traits.is_integral(col_type)
+ and plc.types.size_of(col_type) <= 4
+ ):
+ plc_col = plc.unary.cast(
+ plc_col, plc.types.DataType(plc.types.TypeId.INT64)
+ )
+ elif (
+ self.name == "cum_sum"
+ and column.obj.type().id() == plc.types.TypeId.BOOL8
+ ):
+ plc_col = plc.unary.cast(
+ plc_col, plc.types.DataType(plc.types.TypeId.UINT32)
+ )
+ if self.name == "cum_sum":
+ agg = plc.aggregation.sum()
+ elif self.name == "cum_prod":
+ agg = plc.aggregation.product()
+ elif self.name == "cum_min":
+ agg = plc.aggregation.min()
+ elif self.name == "cum_max":
+ agg = plc.aggregation.max()
+
+ return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE))
 raise NotImplementedError(
 f"Unimplemented unary function {self.name=}"
 ) # pragma: no cover; init trips first
 def collect_agg(self, *, depth: int) -> AggInfo:
 """Collect information about aggregations in groupbys."""
+ if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs:
+ raise NotImplementedError(f"{self.name} in groupby")
 if depth == 1:
 # inside aggregation, need to pre-evaluate, groupby
 # construction has checked that we don't have nested aggs,
@@ -1187,11 +1471,7 @@ class Cast(Expr):
 def __init__(self, dtype: plc.DataType, value: Expr) -> None:
 super().__init__(dtype)
 self.children = (value,)
- if not (
- plc.traits.is_fixed_width(self.dtype)
- and plc.traits.is_fixed_width(value.dtype)
- and plc.unary.is_supported_cast(value.dtype, self.dtype)
- ):
+ if not dtypes.can_cast(value.dtype, self.dtype):
 raise NotImplementedError(
 f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}"
 )
@@ -1255,6 +1535,13 @@ def __init__(
 req = plc.aggregation.variance(ddof=options)
 elif name == "count":
 req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE)
+ elif name == "quantile":
+ _, quantile = self.children
+ if not isinstance(quantile, Literal):
+ raise NotImplementedError("Only support literal quantile values")
+ req = plc.aggregation.quantile(
+ quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+ )
 else:
 raise NotImplementedError(
 f"Unreachable, {name=} is incorrectly listed in _SUPPORTED"
@@ -1286,9 +1573,18 @@ def __init__(
 "count",
 "std",
 "var",
+ "quantile",
 ]
 )
+ interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = {
+ "nearest": plc.types.Interpolation.NEAREST,
+ "higher": plc.types.Interpolation.HIGHER,
+ "lower": plc.types.Interpolation.LOWER,
+ "midpoint": plc.types.Interpolation.MIDPOINT,
+ "linear": plc.types.Interpolation.LINEAR,
+ }
+
 def collect_agg(self, *, depth: int) -> AggInfo:
 """Collect information about aggregations in groupbys."""
 if depth >= 1:
@@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 raise NotImplementedError("Nan propagation in groupby for min/max")
 (child,) = self.children
 ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
- if self.request is None:
+ request = self.request
+ # These are handled specially here: we don't set up a request
+ # for the whole-frame agg, since there we can avoid a reduce
+ # entirely.
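For the `first`/`last` special case that follows: in a group-by, polars keeps a leading or trailing null rather than skipping it, which is why the requests built below use `nth_element` with `NullPolicy.INCLUDE` instead of a null-excluding reduction. A small demonstration of that semantic (data chosen arbitrarily):

```python
import polars as pl

df = pl.DataFrame({"g": [1, 1, 2, 2], "x": [None, 10, 30, None]})

# first/last keep a leading/trailing null instead of skipping it,
# i.e. nth_element(0) / nth_element(-1) with nulls included.
out = df.group_by("g", maintain_order=True).agg(
    first=pl.col("x").first(), last=pl.col("x").last()
)
assert out["first"].to_list() == [None, 30]
assert out["last"].to_list() == [10, None]
```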
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
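The CSV branch above reads the candidate files one at a time and stops as soon as the requested number of rows has been produced. A self-contained sketch of that budgeting loop; `read_one` is a toy stand-in for the pylibcudf reader, and all names here are illustrative:

```python
def read_one(path: str, nrows: int) -> list[int]:
    # Toy stand-in for a per-file CSV read: pretend every file holds
    # three rows and honour the row cap (-1 means "no cap").
    rows = [1, 2, 3]
    return rows if nrows == -1 else rows[:nrows]


def read_limited(paths: list[str], n_rows: int) -> list[list[int]]:
    # Mirror of the loop above: decrement the remaining budget after
    # each file and stop once it is exhausted.
    pieces = []
    read_partial = n_rows != -1
    for path in paths:
        piece = read_one(path, nrows=n_rows)
        pieces.append(piece)
        if read_partial:
            n_rows -= len(piece)
            if n_rows <= 0:
                break
    return pieces


assert read_limited(["a.csv", "b.csv", "c.csv"], 4) == [[1, 2, 3], [1]]
```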
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> None: 
@staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + ) + ] 
+ ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..a0291037f01 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,27 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +306,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
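On the `Join` changes earlier in this patch: `how="right"` is implemented as a left join with the operands swapped, then swapped back so that suffixes land on the correct side. Assuming a polars version that exposes right joins, the equivalence being exploited looks like this (modulo column order, hence the `select`):

```python
import polars as pl

left = pl.DataFrame({"k": [1, 2], "a": ["x", "y"]})
right = pl.DataFrame({"k": [2, 3], "b": ["u", "v"]})

# A right join is a left join with the tables swapped.
r1 = left.join(right, on="k", how="right")
r2 = right.join(left, on="k", how="left")
cols = sorted(r1.columns)
assert sorted(r2.columns) == cols
assert r1.sort("k").select(cols).equals(r2.sort("k").select(cols))
```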
+ ) + return result def translate_named_expr( @@ -345,6 +376,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +418,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..7b6f3848fc4 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,98 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. If + None, CPU is not expected to raise an exception. + cudf_except + Exception or exceptions polars GPU is expected to raise. If + None, GPU is not expected to raise an exception. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. 
+ + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. + """ + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs + ) + + try: + lazydf.collect(**final_polars_collect_kwargs) + except polars_except: + pass + except Exception as e: + raise AssertionError( + f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" + ) from e + else: + if polars_except != (): + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + + engine = GPUEngine(raise_on_fail=True) + try: + lazydf.collect(**final_cudf_collect_kwargs, engine=engine) + except cudf_except: + pass + except Exception as e: + raise AssertionError( + f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}" + ) from e + else: + if cudf_except != (): + raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py new file mode 100644 index 00000000000..05b76d76808 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -0,0 +1,158 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running polars test suite setting GPU engine as default.""" + +from __future__ import annotations + +from functools import partialmethod +from typing import TYPE_CHECKING + +import pytest + +import polars + +if TYPE_CHECKING: + from collections.abc import Mapping + + +def pytest_addoption(parser: pytest.Parser): + """Add plugin-specific options.""" + group = parser.getgroup( + "cudf-polars", "Plugin to set GPU as default engine for polars tests" + ) + group.addoption( + "--cudf-polars-no-fallback", + action="store_true", + help="Turn off fallback to CPU when running tests (default use fallback)", + ) + + +def pytest_configure(config: pytest.Config): + """Enable use of this module as a pytest plugin to enable GPU collection.""" + no_fallback = config.getoption("--cudf-polars-no-fallback") + collect = polars.LazyFrame.collect + engine = polars.GPUEngine(raise_on_fail=no_fallback) + polars.LazyFrame.collect = partialmethod(collect, engine=engine) + config.addinivalue_line( + "filterwarnings", + "ignore:.*GPU engine does not support streaming or background collection", + ) + config.addinivalue_line( + "filterwarnings", + "ignore:.*Query execution with GPU not supported", + ) + + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", + "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", + "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", + "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + 
"tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": 
"Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + 
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if casting is supported, False otherwise + """ + return ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 9807cffb384..4a7ad6b3cf2 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") -POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") -POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") -POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") -POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") -POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") -POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") - -POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") -POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") -POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") -POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") - -if POLARS_VERSION < parse("1.0"): # pragma: no cover - raise ImportError("cudf_polars requires py-polars v1.0 or greater.") +POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") + + +def _ensure_polars_version(): + if POLARS_VERSION_LT_18: + raise ImportError( + "cudf_polars requires py-polars v1.8 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index daf8286ae07..bff44af1468 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -15,8 +15,10 @@ You will need: ## Installing polars -We will need to build polars from source. Until things settle down, -live at `HEAD`. +`cudf-polars` works with polars >= 1.8, as long as the internal IR +version doesn't get a major version bump. So `pip install polars>=1.8` +should work. For development, if we are adding features on the polars +side, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -uv pip install --no-build-isolation --no-deps -e . +pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -69,16 +71,18 @@ pytest -v tests # Executor design -The polars `LazyFrame.collect` functionality offers a -"post-optimization" callback that may be used by a third party library -to replace a node (or more, though we only replace a single node) in the -optimized logical plan with a Python callback that is to deliver the -result of evaluating the plan. This splits the execution of the plan -into two phases. First, a symbolic phase which translates to our -internal representation (IR). Second, an execution phase which executes -using our IR. - -The translation phase receives the a low-level Rust `NodeTraverse` +The polars `LazyFrame.collect` functionality offers configuration of +the engine to use for collection through the `engine` argument.
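As an aside, the `can_cast` helper added to `cudf_polars/utils/dtypes.py` above is one of the translation-time guards this design relies on: if a required cast is unsupported, translation raises `NotImplementedError` and the query transparently falls back to CPU execution. A minimal sketch of that pattern (the `check_cast` name is illustrative, not part of this diff):

```python
import pylibcudf as plc

from cudf_polars.utils.dtypes import can_cast


def check_cast(from_: plc.DataType, to: plc.DataType) -> None:
    # Raising NotImplementedError during translation leaves the logical
    # plan untouched, so polars falls back to CPU execution.
    if not can_cast(from_, to):
        raise NotImplementedError(f"Cast from {from_.id()} to {to.id()}")
```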
At a +low level, selecting an engine configures a "post-optimization" +callback that may be used by a third party library to replace a node +(or more, though we only replace a single node) in the optimized +logical plan with a Python callback that delivers the result of +evaluating the plan. This splits the execution of the plan into two +phases. First, a symbolic phase which translates to our internal +representation (IR). Second, an execution phase which executes using +our IR. + +The translation phase receives a low-level Rust `NodeTraverser` object which delivers Python representations of the plan nodes (and expressions) one at a time. During translation, we endeavour to raise `NotImplementedError` for any unsupported functionality. This way, if @@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at all: if we can translate the IR, it is assumed that evaluation will later succeed. -The usage of the cudf-based executor is therefore, at present: +The cudf-based executor is therefore selected with the +GPU engine: ```python -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect(post_opt_callback=execute_with_cudf) +result = q.collect(engine="gpu") ``` This should either transparently run on the GPU and deliver a polars dataframe, or else fail (but be handled) and just run the normal CPU -execution. +execution. If `POLARS_VERBOSE` is true, then fallback is logged with a +`PerformanceWarning`. -If you want to fail during translation, set the keyword argument -`raise_on_fail` to `True`: +As well as a string argument, the engine can also be specified with a +polars `GPUEngine` object. This allows passing in more configuration. +Currently, the public properties are `device`, to select the device, +and `memory_resource`, to select the RMM memory resource used for +allocations during the collection phase. +For example: ```python -from functools import partial -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect( - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) -) +result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr)) +``` + +This uses device 1 and the given memory resource. Note that the memory +resource provided _must_ be valid for allocations on the specified +device; no checking is performed. + +For debugging purposes, we can also pass undocumented keyword +arguments. At the moment, `raise_on_fail` is supported, which +raises during translation rather than falling back: + +```python +result = q.collect(engine=pl.GPUEngine(raise_on_fail=True)) ``` This is mostly useful when writing tests, since in that case we want any failures to propagate, rather than falling back to the CPU mode. +## IR versioning + +On the polars side, the `NodeTraverser` object advertises an internal +version (via `NodeTraverser.version()`) as a `(major, minor)` tuple. +`minor` version bumps are for backwards-compatible changes (e.g. +exposing new nodes), whereas `major` bumps are for incompatible +changes. We can therefore attempt to detect the IR version +(independently of the polars version) and dispatch or error +appropriately. This should be done during IR translation in +`translate.py`. + ## Adding a handler for a new plan node Plan node definitions live in `cudf_polars/dsl/ir.py`, these are @@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2.
`Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index b44f633e2d9..f55031e0826 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", + "polars>=1.8,<1.9", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ -34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if 
agg == "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, True], 
ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) -def test_datetime_extract(field): +def test_date_extract(field): + ldf = pl.LazyFrame( + { + "dates": [ + datetime.date(2024, 1, 1), + datetime.date(2024, 10, 11), + ] + } + ) + ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. 
- # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ 
-152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..74bf8b9e4e2 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,21 +162,17 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", [ pl.lit(1).alias("value"), - pl.lit([[4, 5, 6]]).alias("value"), + pytest.param( + pl.lit([[4, 5, 6]]).alias("value"), + marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), + ), + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], @@ -183,3 +182,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", "parquet") + q = 
pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..ace1c6b8648 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -8,6 +8,7 @@ import polars as pl from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +27,59 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"): + # This should raise, because polars CPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=(), + ) + + with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"): + # This should raise, because polars GPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=(), + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError, match="GPU execution RAISED"): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError, match="CPU execution RAISED"): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 4655d2165f0..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -16,6 +16,7 @@ See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to ## Resources - [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) - [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) - [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) - [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 97e1dffc65b..907abaa2bfc 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.typing import no_default import cudf @@ -90,6 +91,17 @@ def var( ) ) + def rename_axis( + self, mapper=no_default, index=no_default, columns=no_default, axis=0 + ): + from dask_cudf.expr._expr import RenameAxisCudf + + return new_collection( + RenameAxisCudf( + self, mapper=mapper, index=index, columns=columns, axis=axis + ) + ) + class DataFrame(DXDataFrame, CudfFrameBase): @classmethod @@ -202,27 +214,58 @@ class Index(DXIndex, CudfFrameBase): ## -try: - from dask_expr._backends import create_array_collection - - @get_collection_type.register_lazy("cupy") - def _register_cupy(): - import cupy - - @get_collection_type.register(cupy.ndarray) - def get_collection_type_cupy_array(_): - return create_array_collection - - @get_collection_type.register_lazy("cupyx") - def _register_cupyx(): - # Needed for cuml - from cupyx.scipy.sparse import spmatrix - - @get_collection_type.register(spmatrix) - def get_collection_type_csr_matrix(_): - return create_array_collection - -except ImportError: - # Older version of dask-expr. - # Implicit conversion to array wont work. - pass +def _create_array_collection_with_meta(expr): + # NOTE: This is the GPU compatible version of + # `new_dd_object` for DataFrame -> Array conversion. 
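+ # (The per-partition row counts are not known until the graph is + # computed, which is why the first chunk dimension is NaN below.)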
+ # This can be removed if dask#11017 is resolved + # (See: https://github.com/dask/dask/issues/11017) + import numpy as np + + import dask.array as da + from dask.blockwise import Blockwise + from dask.highlevelgraph import HighLevelGraph + + result = expr.optimize() + dsk = result.__dask_graph__() + name = result._name + meta = result._meta + divisions = result.divisions + chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( + (d,) for d in meta.shape[1:] + ) + if len(chunks) > 1: + if isinstance(dsk, HighLevelGraph): + layer = dsk.layers[name] + else: + # dask-expr provides a dict only + layer = dsk + if isinstance(layer, Blockwise): + layer.new_axes["j"] = chunks[1][0] + layer.output_indices = layer.output_indices + ("j",) + else: + suffix = (0,) * (len(chunks) - 1) + for i in range(len(chunks[0])): + layer[(name, i) + suffix] = layer.pop((name, i)) + + return da.Array(dsk, name=name, chunks=chunks, meta=meta) + + +@get_collection_type.register_lazy("cupy") +def _register_cupy(): + import cupy + + get_collection_type.register( + cupy.ndarray, + lambda _: _create_array_collection_with_meta, + ) + + +@get_collection_type.register_lazy("cupyx") +def _register_cupyx(): + # Needed for cuml + from cupyx.scipy.sparse import spmatrix + + get_collection_type.register( + spmatrix, + lambda _: _create_array_collection_with_meta, + ) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8a2c50d3fe7..b284ab3774d 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,11 +4,12 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default import cudf @@ -17,6 +18,19 @@ ## +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + class ToCudfBackend(Elemwise): # TODO: Inherit from ToBackend when rapids-dask-dependency # is pinned to dask>=2024.8.1 diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7aa0f6320f2..5f0fae86691 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -16,6 +16,7 @@ import dask_cudf from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, require_dask_expr, skip_dask_expr, xfail_dask_expr, @@ -950,12 +951,16 @@ def test_implicit_array_conversion_cupy(): def func(x): return x.values - # Need to compute the dask collection for now. 
- # See: https://github.com/dask/dask/issues/11017 - result = ds.map_partitions(func, meta=s.values).compute() - expect = func(s) + result = ds.map_partitions(func, meta=s.values) - dask.array.assert_eq(result, expect) + if QUERY_PLANNING_ON: + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) + else: + # Legacy version still carries numpy metadata + # See: https://github.com/dask/dask/issues/11017 + dask.array.assert_eq(result.compute(), func(s)) def test_implicit_array_conversion_cupy_sparse(): @@ -967,8 +972,6 @@ def func(x): return cupyx.scipy.sparse.csr_matrix(x.values) - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 result = ds.map_partitions(func, meta=s.values).compute() expect = func(s) @@ -1024,3 +1027,15 @@ def test_cov_corr(op, numeric_only): # (See: https://github.com/rapidsai/cudf/issues/12626) expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) dd.assert_eq(res, expect) + + +def test_rename_axis_after_join(): + df1 = cudf.DataFrame(index=["a", "b", "c"], data=dict(a=[1, 2, 3])) + df1.index.name = "test" + ddf1 = dd.from_pandas(df1, 2) + + df2 = cudf.DataFrame(index=["a", "b", "d"], data=dict(b=[1, 2, 3])) + ddf2 = dd.from_pandas(df2, 2) + result = ddf1.join(ddf2, how="outer") + expected = df1.join(df2, how="outer") + dd.assert_eq(result, expected, check_index=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 88b15718382..d03e92319be 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,6 +13,7 @@ def _make_random_frame(nelem, npartitions=2): + np.random.seed(0) df = pd.DataFrame( { "x": np.random.randint(0, 5, size=nelem), @@ -38,7 +39,6 @@ def wrapped(series): @pytest.mark.parametrize("reducer", _reducers) def test_series_reduce(reducer): reducer = _get_reduce_fn(reducer) - np.random.seed(0) size = 10 df, gdf = _make_random_frame(size) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5a67f4d6cdb..5f9d145139a 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -94,7 +94,7 @@ cpdef bool is_supported_operation( ): """Check if an operation is supported for the given data types. - For details, see :cpp:func::is_supported_operation`. + For details, see :cpp:func:`is_supported_operation`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index 4601cba515a..e9085e3ea02 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -18,6 +18,20 @@ from .types import MaskState, TypeId cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + """Creates an empty column of the specified type. + + For details, see :cpp:func:`make_empty_column`. + + Parameters + ---------- + type_or_id : Union[DataType, type_id, object] + The column data type. + + Returns + ------- + Column + An empty Column. + """ cdef unique_ptr[column] result cdef type_id id @@ -60,7 +74,11 @@ cpdef Column make_numeric_column( size_type size, MaskArg mstate ): + """Creates an empty numeric column. + + For details, see :cpp:func:`make_numeric_column`.
+ """ cdef unique_ptr[column] result cdef mask_state state diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index ae5d33aaa46..afb95dba5b3 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -286,7 +286,7 @@ cdef class GroupBy: Returns ------- - Tuple[List[int], Table, Table]] + Tuple[List[int], Table, Table] A tuple of tables containing three items: - A list of integer offsets into the group keys/values - A table of group keys diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 667c67f4c36..438b0ff1634 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_avro`. 
diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx
index ae5d33aaa46..afb95dba5b3 100644
--- a/python/pylibcudf/pylibcudf/groupby.pyx
+++ b/python/pylibcudf/pylibcudf/groupby.pyx
@@ -286,7 +286,7 @@ cdef class GroupBy:
 
         Returns
         -------
-        Tuple[List[int], Table, Table]]
+        Tuple[List[int], Table, Table]
             A tuple of tables containing three items:
                 - A list of integer offsets into the group keys/values
                 - A table of group keys
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 667c67f4c36..438b0ff1634 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro(
     """
     Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_avro`.
+
     Parameters
     ----------
     source_info: SourceInfo
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index df1f1b14247..981ca7b8159 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -59,6 +59,8 @@ cdef class ChunkedParquetReader:
     """
     Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:class:`chunked_parquet_reader`.
+
     Parameters
     ----------
     source_info : SourceInfo
@@ -167,6 +169,8 @@ cpdef read_parquet(
 ):
     """Reads a Parquet file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_parquet`.
+
     Parameters
     ----------
     source_info : SourceInfo
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
index b5a7445df36..b3f6a92d85c 100644
--- a/python/pylibcudf/pylibcudf/labeling.pyx
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -20,6 +20,8 @@ cpdef Column label_bins(
 ):
     """Labels elements based on membership in the specified bins.
 
+    For details, see :cpp:func:`label_bins`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
index bd6e2e0af02..abf4357f862 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx regex_flags.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
index 12cd628fc1f..b7166167cfd 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -10,5 +10,9 @@
 cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
+
+    cdef unique_ptr[column] extract_all_record(
+        column_view input,
+        regex_program prog) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
index b25724586e1..e0a8b776465 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
@@ -9,5 +9,5 @@
 cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[column] findall(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
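A hedged sketch of driving the cross-referenced readers from Python; `SourceInfo` accepting a list of paths is assumed from pylibcudf's io subpackage, and all reader options are left at their defaults:

    import pylibcudf as plc

    source = plc.io.SourceInfo(["data.parquet"])
    # read_parquet returns a TableWithMetadata; .tbl holds the Table payload.
    result = plc.io.parquet.read_parquet(source)
    table = result.tbl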
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
index 3a89299f11a..019ff3f17ba 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 
 cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil:
 
-    ctypedef enum side_type:
+    cpdef enum class side_type(int32_t):
         LEFT 'cudf::strings::side_type::LEFT'
         RIGHT 'cudf::strings::side_type::RIGHT'
         BOTH 'cudf::strings::side_type::BOTH'
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
index 947caddc485..6f82124d06e 100644
--- a/python/pylibcudf/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -52,6 +52,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 
     All other columns will be duplicated for each element in the list.
 
+    For details, see :cpp:func:`explode_outer`.
+
     Parameters
     ----------
     input : Table
@@ -75,6 +77,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
 cpdef Column concatenate_rows(Table input):
     """Concatenate multiple lists columns into a single lists column row-wise.
 
+    For details, see :cpp:func:`concatenate_rows`.
+
     Parameters
     ----------
     input : Table
@@ -96,6 +100,8 @@ cpdef Column concatenate_rows(Table input):
 cpdef Column concatenate_list_elements(Column input, bool dropna):
     """Concatenate multiple lists on the same row into a single list.
 
+    For details, see :cpp:func:`concatenate_list_elements`.
+
     Parameters
     ----------
     input : Column
@@ -168,6 +174,8 @@ cpdef Column contains_nulls(Column input):
     """Create a column of bool values indicating whether
     each row in the lists column contains a null value.
 
+    For details, see :cpp:func:`contains_nulls`.
+
     Parameters
     ----------
     input : Column
@@ -290,6 +298,8 @@ cpdef Column segmented_gather(Column input, Column gather_map_list):
 cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
     """Create a column of extracted list elements.
 
+    For details, see :cpp:func:`extract_list_element`.
+
     Parameters
     ----------
     input : Column
diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx
index a7d43c9d158..6d707b67449 100644
--- a/python/pylibcudf/pylibcudf/merge.pyx
+++ b/python/pylibcudf/pylibcudf/merge.pyx
@@ -19,6 +19,8 @@ cpdef Table merge (
 ):
     """Merge a set of sorted tables.
 
+    For details, see :cpp:func:`merge`.
+
     Parameters
     ----------
     tables_to_merge : list
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx
index b847ade774d..3a771fbe7ef 100644
--- a/python/pylibcudf/pylibcudf/quantiles.pyx
+++ b/python/pylibcudf/pylibcudf/quantiles.pyx
@@ -30,6 +30,8 @@ cpdef Column quantile(
     Computes the specified quantiles by interpolating values between which they
     lie, using the interpolation strategy specified in interp.
 
+    For details, see :cpp:func:`quantile`.
+
     Parameters
     ----------
     input: Column
@@ -91,6 +93,8 @@ cpdef Table quantiles(
     specified quantiles. In the event a quantile lies in between rows, the
     specified interpolation strategy is used to pick between the rows.
 
+    For details, see :cpp:func:`quantiles`.
+
     Parameters
     ----------
     input: Table
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
index a99145be900..eb1499ebbea 100644
--- a/python/pylibcudf/pylibcudf/reshape.pyx
+++ b/python/pylibcudf/pylibcudf/reshape.pyx
@@ -23,6 +23,8 @@ cpdef Column interleave_columns(Table source_table):
         in = [[A1, A2, A3], [B1, B2, B3]]
         return = [A1, B1, A2, B2, A3, B3]
 
+    For details, see :cpp:func:`interleave_columns`.
+
     Parameters
     ----------
     source_table: Table
@@ -44,6 +46,8 @@ cpdef Column interleave_columns(Table source_table):
 cpdef Table tile(Table source_table, size_type count):
     """Repeats the rows from input table count times to form a new table.
 
+    For details, see :cpp:func:`tile`.
+
     Parameters
     ----------
     source_table: Table
diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx
index ff2468f3f9c..814bc6553d8 100644
--- a/python/pylibcudf/pylibcudf/search.pyx
+++ b/python/pylibcudf/pylibcudf/search.pyx
@@ -19,6 +19,8 @@ cpdef Column lower_bound(
 ):
     """Find smallest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`lower_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -58,6 +60,8 @@ cpdef Column upper_bound(
 ):
     """Find largest indices in haystack where needles may be inserted to retain order.
 
+    For details, see :cpp:func:`upper_bound`.
+
     Parameters
     ----------
     haystack : Table
@@ -92,6 +96,8 @@ cpdef Column upper_bound(
 cpdef Column contains(Column haystack, Column needles):
     """Check whether needles are present in haystack.
 
+    For details, see :cpp:func:`contains`.
+
     Parameters
     ----------
     haystack : Column
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx
index bd173eebacb..42289d54bca 100644
--- a/python/pylibcudf/pylibcudf/sorting.pyx
+++ b/python/pylibcudf/pylibcudf/sorting.pyx
@@ -16,6 +16,8 @@ from .table cimport Table
 cpdef Column sorted_order(Table source_table, list column_order, list null_precedence):
     """Computes the row indices required to sort the table.
 
+    For details, see :cpp:func:`sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -52,6 +54,8 @@ cpdef Column stable_sorted_order(
     """Computes the row indices required to sort the table,
     preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sorted_order`.
+
     Parameters
     ----------
     source_table : Table
@@ -90,6 +94,8 @@ cpdef Column rank(
 ):
     """Computes the rank of each element in the column.
 
+    For details, see :cpp:func:`rank`.
+
     Parameters
     ----------
     input_view : Column
@@ -128,6 +134,8 @@ cpdef Column rank(
 cpdef bool is_sorted(Table tbl, list column_order, list null_precedence):
     """Checks if the table is sorted.
 
+    For details, see :cpp:func:`is_sorted`.
+
     Parameters
     ----------
     tbl : Table
@@ -165,6 +173,8 @@ cpdef Table segmented_sort_by_key(
 ):
     """Sorts the table by key, within segments.
 
+    For details, see :cpp:func:`segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -209,6 +219,8 @@ cpdef Table stable_segmented_sort_by_key(
     """Sorts the table by key preserving order of equal elements,
     within segments.
 
+    For details, see :cpp:func:`stable_segmented_sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -251,6 +263,8 @@ cpdef Table sort_by_key(
 ):
     """Sorts the table by key.
 
+    For details, see :cpp:func:`sort_by_key`.
+
     Parameters
     ----------
     values : Table
@@ -290,6 +304,8 @@ cpdef Table stable_sort_by_key(
 ):
     """Sorts the table by key preserving order of equal elements.
 
+    For details, see :cpp:func:`stable_sort_by_key`.
+ Parameters ---------- values : Table @@ -324,6 +340,8 @@ cpdef Table stable_sort_by_key( cpdef Table sort(Table source_table, list column_order, list null_precedence): """Sorts the table. + For details, see :cpp:func:`sort`. + Parameters ---------- source_table : Table @@ -355,6 +373,8 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): """Sorts the table preserving order of equal elements. + For details, see :cpp:func:`stable_sort`. + Parameters ---------- source_table : Table diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index b574bfa9fa2..d5475ea79d5 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -25,6 +25,8 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. + For details, see :cpp:func:`drop_nulls`. + Parameters ---------- source_table : Table @@ -53,6 +55,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of NaNs. + For details, see :cpp:func:`drop_nans`. + Parameters ---------- source_table : Table @@ -81,6 +85,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. + For details, see :cpp:func:`apply_boolean_mask`. + Parameters ---------- source_table : Table @@ -111,6 +117,8 @@ cpdef Table unique( ): """Filter duplicate consecutive rows from the input table. + For details, see :cpp:func:`unique`. + Parameters ---------- input : Table @@ -153,6 +161,8 @@ cpdef Table distinct( ): """Get the distinct rows from the input table. + For details, see :cpp:func:`distinct`. + Parameters ---------- input : Table @@ -191,6 +201,8 @@ cpdef Column distinct_indices( ): """Get the indices of the distinct rows from the input table. + For details, see :cpp:func:`distinct_indices`. + Parameters ---------- input : Table @@ -226,6 +238,8 @@ cpdef Table stable_distinct( ): """Get the distinct rows from the input table, preserving input order. + For details, see :cpp:func:`stable_distinct`. + Parameters ---------- input : Table @@ -263,6 +277,8 @@ cpdef size_type unique_count( ): """Returns the number of unique consecutive elements in the input column. + For details, see :cpp:func:`unique_count`. + Parameters ---------- source : Column @@ -294,6 +310,8 @@ cpdef size_type distinct_count( ): """Returns the number of distinct elements in the input column. + For details, see :cpp:func:`distinct_count`. + Parameters ---------- source : Column diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 457e462e3cf..77f20b0b917 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx repeat.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..91d884b294b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,9 +5,14 @@ from . cimport ( case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 250cefedf55..b4856784390 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,10 +5,15 @@ case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . 
import convert_datetime, convert_durations
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd
new file mode 100644
index 00000000000..07c84d263d6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_timestamps(
+    Column input,
+    DataType timestamp_type,
+    const string& format
+)
+
+cpdef Column from_timestamps(
+    Column input,
+    const string& format,
+    Column input_strings_names
+)
+
+cpdef Column is_timestamp(
+    Column input,
+    const string& format
+)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
new file mode 100644
index 00000000000..fcacb096f87
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_datetime as cpp_convert_datetime,
+)
+
+from pylibcudf.types import DataType
+
+
+cpdef Column to_timestamps(
+    Column input,
+    DataType timestamp_type,
+    const string& format
+):
+    """Convert a strings column to timestamps of the given type.
+
+    For details, see :cpp:func:`cudf::strings::to_timestamps`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.to_timestamps(
+            input.view(),
+            timestamp_type.c_obj,
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column from_timestamps(
+    Column input,
+    const string& format,
+    Column input_strings_names
+):
+    """Convert a timestamps column to formatted strings.
+
+    For details, see :cpp:func:`cudf::strings::from_timestamps`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.from_timestamps(
+            input.view(),
+            format,
+            input_strings_names.view()
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column is_timestamp(
+    Column input,
+    const string& format
+):
+    """Check that each string matches the given timestamp format.
+
+    For details, see :cpp:func:`cudf::strings::is_timestamp`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_datetime.is_timestamp(
+            input.view(),
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
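The round trip is exercised by `test_string_convert.py` later in this diff; a minimal sketch of the call shape, obtaining the target DataType through the pyarrow interop layer exactly as the test does:

    import pyarrow as pa
    import pylibcudf as plc

    strs = plc.interop.from_arrow(pa.array(["2011-01-01"]))
    ts = plc.strings.convert.convert_datetime.to_timestamps(
        strs,
        plc.interop.from_arrow(pa.timestamp("us")),
        b"%Y-%m-%d",  # the format maps to a C++ std::string, so pass bytes
    )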
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd
new file mode 100644
index 00000000000..ac11b8959ed
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.types cimport DataType
+
+
+cpdef Column to_durations(
+    Column input,
+    DataType duration_type,
+    const string& format
+)
+
+cpdef Column from_durations(
+    Column input,
+    const string& format
+)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
new file mode 100644
index 00000000000..f3e0b7c9c8e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
@@ -0,0 +1,49 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_durations as cpp_convert_durations,
+)
+
+from pylibcudf.types import DataType
+
+
+cpdef Column to_durations(
+    Column input,
+    DataType duration_type,
+    const string& format
+):
+    """Convert a strings column to durations of the given type.
+
+    For details, see :cpp:func:`cudf::strings::to_durations`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_durations.to_durations(
+            input.view(),
+            duration_type.c_obj,
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column from_durations(
+    Column input,
+    const string& format
+):
+    """Convert a durations column to formatted strings.
+
+    For details, see :cpp:func:`cudf::strings::from_durations`.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_convert_durations.from_durations(
+            input.view(),
+            format
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd
new file mode 100644
index 00000000000..3871f5a0e4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog)
+
+cpdef Column extract_all_record(Column input, RegexProgram prog)
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
new file mode 100644
index 00000000000..dcb11ca10ce
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport extract as cpp_extract
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog):
+    """
+    Returns a table of strings columns where each column
+    corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Table
+        Columns of strings extracted from the input column.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column extract_all_record(Column input, RegexProgram prog):
+    """
+    Returns a lists column of strings where each row contains the
+    strings extracted for the matching groups specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract_all_record`.
+ + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Column + Lists column containing strings extracted from the input column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd new file mode 100644 index 00000000000..54afa088141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx new file mode 100644 index 00000000000..3a6b87504b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport findall as cpp_findall +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern): + """ + Returns a lists column of strings for each matching occurrence using + the regex_program pattern within each string. + + For details, see :cpp:func:`cudf::strings::findall`. + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New lists column of strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx new file mode 100644 index 00000000000..acdc7d6ff1f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type import \ + side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd new file mode 100644 index 00000000000..8bbe4753edd --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=*,
+    Scalar to_strip=*
+)
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx
new file mode 100644
index 00000000000..429a23c3cdf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyx
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings cimport strip as cpp_strip
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=side_type.BOTH,
+    Scalar to_strip=None
+):
+    """Removes the specified characters from the beginning
+    or end (or both) of each string.
+
+    For details, see :cpp:func:`cudf::strings::strip`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    side : SideType, default SideType.BOTH
+        Indicates whether characters are stripped from the
+        beginning, the end, or both of each string.
+    to_strip : Scalar
+        UTF-8 encoded characters to strip from each string.
+        Default is the empty string, which strips whitespace characters.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New strings column.
+    """
+
+    if to_strip is None:
+        to_strip = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* cpp_to_strip
+    cpp_to_strip = <string_scalar *>(to_strip.c_obj.get())
+
+    with nogil:
+        c_result = cpp_strip.strip(
+            input.view(),
+            side,
+            dereference(cpp_to_strip)
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini
index 1761c0f011c..f572f85ca49 100644
--- a/python/pylibcudf/pylibcudf/tests/pytest.ini
+++ b/python/pylibcudf/pylibcudf/tests/pytest.ini
@@ -6,3 +6,4 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+addopts = --tb=native
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index d3aa6101e2d..89c96829e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+
+def test_extract():
+    pattern = "([ab])(\\d)"
+    pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pc.extract_regex(arr, pa_pattern)
+    for i, result_col in enumerate(result.itercolumns()):
+        expected_col = pa.chunked_array(expected.field(i))
+        assert result_col.fill_null("").equals(expected_col)
+
+
+def test_extract_all_record():
+    pattern = "([ab])(\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract_all_record(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(
+        [pa.array([["a", "1"], ["b", "2"], None], type=result.type)]
+    )
+    assert result.equals(expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
new file mode 100644
index 00000000000..994552fa276
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import re
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_findall():
+    arr = pa.array(["bunny", "rabbit", "hare", "dog"])
+    pattern = "[ab]"
+    result = plc.strings.findall.findall(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    pa_result = plc.interop.to_arrow(result)
+    expected = pa.array(
+        [re.findall(pattern, elem) for elem in arr.to_pylist()],
+        type=pa_result.type,
+    )
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
new file mode 100644
index 00000000000..005e5e4a405
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+data_strings = [
+    "AbC",
+    "123abc",
+    "",
+    " ",
+    None,
+    "aAaaaAAaa",
+    " ab c ",
+    "abc123",
+    " ",
+    "\tabc\t",
+    "\nabc\n",
+    "\r\nabc\r\n",
+    "\t\n abc \n\t",
+    "!@#$%^&*()",
+    " abc!!! ",
+    " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..de425a27c15 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,19 +1,27 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. + For details, see :cpp:func:`nans_to_nulls`. + Parameters ---------- input : Column @@ -32,3 +40,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
+
+    Parameters
+    ----------
+    bitmask : int
+        Pointer to the bitmask which needs to be converted
+    begin_bit : int
+        Position of the bit from which the conversion should start
+    end_bit : int
+        Position of the bit before which the conversion should stop
+
+    Returns
+    -------
+    Column
+        Boolean column of the mask range [begin_bit, end_bit)
+    """
+    cdef unique_ptr[column] c_result
+    cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask)
+
+    with nogil:
+        c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx):
+    """Create a new column by applying a unary function against every
+    element of an input column.
+
+    Parameters
+    ----------
+    input : Column
+        Column to transform.
+    unary_udf : str
+        The PTX/CUDA string of the unary function to apply.
+    output_type : DataType
+        The output type that the unary_udf is expected to produce.
+    is_ptx : bool
+        If `True`, the UDF is treated as PTX code.
+        If `False`, the UDF is treated as CUDA code.
+
+    Returns
+    -------
+    Column
+        The transformed column having the UDF applied to each element.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_unary_udf = unary_udf.encode()
+    cdef bool c_is_ptx = is_ptx
+
+    with nogil:
+        c_result = move(
+            cpp_transform.transform(
+                input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef tuple[Table, Column] encode(Table input):
+    """Encode the rows of the given table as integers.
+
+    Parameters
+    ----------
+    input : Table
+        Table containing values to be encoded.
+
+    Returns
+    -------
+    tuple[Table, Column]
+        The distinct rows of the input table in sorted order,
+        and a column of integer indices representing the encoded rows.
+    """
+    cdef pair[unique_ptr[table], unique_ptr[column]] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.encode(input.view()))
+
+    return (
+        Table.from_libcudf(move(c_result.first)),
+        Column.from_libcudf(move(c_result.second))
+    )
+
+cpdef Table one_hot_encode(Column input, Column categories):
+    """Encodes `input` by generating a new column
+    for each value in `categories` indicating the presence
+    of that value in `input`.
+
+    Parameters
+    ----------
+    input : Column
+        Column containing values to be encoded.
+    categories : Column
+        Column containing categories.
+
+    Returns
+    -------
+    Table
+        A table of the encoded values.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view()))
+
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)
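Taken together, the new strings APIs compose into small pipelines. A sketch using only calls exercised by the tests above; the default `strip` arguments are taken to mean strip-whitespace-from-both-sides, per the docstring earlier in this diff:

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([" a1 ", " b2 ", " c3 "]))
    stripped = plc.strings.strip.strip(col)  # SideType.BOTH, whitespace
    prog = plc.strings.regex_program.RegexProgram.create(
        "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    groups = plc.strings.extract.extract(stripped, prog)   # Table: one column per group
    matches = plc.strings.findall.findall(stripped, prog)  # lists column of matches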
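Likewise, a sketch of the encode entry points documented above, grounded in the semantics shown by `test_transform.py` (non-degenerate categories chosen here so the one-hot output varies):

    import pyarrow as pa
    import pylibcudf as plc

    # encode: distinct rows plus an integer code column for the input rows.
    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 1]}))
    distinct, codes = plc.transform.encode(tbl)

    # one_hot_encode: one boolean column per category value.
    col = plc.interop.from_arrow(pa.array([1, 2, 1]))
    cats = plc.interop.from_arrow(pa.array([1, 2]))
    onehot = plc.transform.one_hot_encode(col, cats)
    # Expected columns: [True, False, True] and [False, True, False].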
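Finally, the sorting wrappers that gained cross-references earlier in this diff follow the same Table/Column conventions; the `plc.types.Order` and `plc.types.NullOrder` enum homes are assumptions:

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"a": [3, 1, 2]}))
    order = plc.sorting.sorted_order(
        tbl, [plc.types.Order.ASCENDING], [plc.types.NullOrder.AFTER]
    )  # Column of row indices that would sort the table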