diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a35802f2ab0..ceee9074b93 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,7 +25,8 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf - - test-cudf-polars + - wheel-build-cudf-polars + - wheel-tests-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -133,9 +134,18 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - test-cudf-polars: + wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_cudf_polars.sh" + wheel-tests-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -143,7 +153,7 @@ jobs: build_type: pull-request # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars - script: "ci/test_cudf_polars.sh" + script: "ci/test_wheel_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f8c4f4b9143..bbcd78d051f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -136,11 +136,6 @@ repos: .*test.*| ^CHANGELOG.md$ ) - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.4.8 hooks: @@ -149,7 +144,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright exclude: | @@ -158,6 +153,12 @@ repos: cpp/src/io/parquet/ipc/Message_generated[.]h$| cpp/src/io/parquet/ipc/Schema_generated[.]h$ ) + - id: verify-alpha-spec + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 diff --git a/build.sh b/build.sh index 4291c88ea12..52bb1e64d16 100755 --- a/build.sh +++ b/build.sh @@ -17,7 +17,7 @@ ARGS=$* # script, and that this script resides in the repo dir! 
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" +VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) @@ -39,6 +39,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li --opensource_nvcomp - disable use of proprietary nvcomp extensions --show_depr_warn - show cmake deprecation warnings --ptds - enable per-thread default stream + --disable_large_strings - disable large strings support --build_metrics - generate build metrics report for libcudf --incl_cache_stats - include cache statistics in build metrics report --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) @@ -69,6 +70,7 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF +BUILD_DISABLE_LARGE_STRINGS=OFF USE_PROPRIETARY_NVCOMP=ON PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true" @@ -153,6 +155,7 @@ function buildLibCudfJniInDocker { -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=OFF \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON \ + -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DRMM_LOGGING_LEVEL=OFF \ -DBUILD_SHARED_LIBS=OFF && \ cmake --build . 
--parallel ${PARALLEL_LEVEL} && \ @@ -239,6 +242,9 @@ if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" fi +if hasArg --disable_large_strings; then + BUILD_DISABLE_LARGE_STRINGS="ON" +fi # If clean given, run it prior to any other steps if hasArg clean; then @@ -292,6 +298,7 @@ if buildAll || hasArg libcudf; then -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ + -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ ${EXTRA_CMAKE_ARGS} diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh new file mode 100755 index 00000000000..9c945e11c00 --- /dev/null +++ b/ci/build_wheel_cudf_polars.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/cudf_polars" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh new file mode 100755 index 00000000000..c10612a065a --- /dev/null +++ b/ci/run_cudf_polars_pytests.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level. 
+ +# Support invoking run_cudf_polars_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ + +python -m pytest --cache-clear "$@" tests diff --git a/ci/test_java.sh b/ci/test_java.sh index 9713eb192d2..629ad11014a 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -39,6 +39,9 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +# disable large strings +export LIBCUDF_LARGE_STRINGS_ENABLED=0 + rapids-logger "Run Java tests" pushd java mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh similarity index 65% rename from ci/test_cudf_polars.sh rename to ci/test_wheel_cudf_polars.sh index 669e049ab26..900acd5d473 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,21 +18,14 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "Install cudf wheel" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/cudf*.whl)[test] - -rapids-logger "Install polars (allow pre-release versions)" -python -m pip install 'polars>=1.0.0a0' +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install ./local-cudf-dep/cudf*.whl rapids-logger "Install cudf_polars" -python -m pip install --no-deps python/cudf_polars +python -m pip install $(echo ./dist/cudf_polars*.whl)[test] rapids-logger "Run cudf_polars tests" @@ -44,13 +37,11 @@ EXITCODE=0 trap set_exitcode ERR set +e -python -m 
pytest \ - --cache-clear \ +./ci/run_cudf_polars_pytests.sh \ --cov cudf_polars \ --cov-fail-under=100 \ - --cov-config=python/cudf_polars/pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ - python/cudf_polars/tests + --cov-config=./pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" trap ERR set -e diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 2b20b9d9ce4..c3800d3cc25 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE=" # Download the cudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +python -m pip install ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 946e2d1cd32..b8d73a01f96 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cubinlinker - cuda-nvtx=11.8 @@ -26,7 +26,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -44,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml 
index f069616ddbe..c32d21c5d36 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cuda-cudart-dev - cuda-nvcc @@ -27,7 +27,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -43,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -66,7 +65,7 @@ dependencies: - pre-commit - pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 -- pynvjitlink +- pynvjitlink>=0.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c01178bf732..4f99411e978 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: 
- - ">=3.26.4" + - ">=3.26.4,!=3.30.0" libarrow_version: - "==16.1.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 94df0433b81..80f737378c5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,8 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) +mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) @@ -365,6 +367,7 @@ add_library( src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu + src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp @@ -782,6 +785,11 @@ if(NOT USE_NVTX) target_compile_definitions(cudf PUBLIC NVTX_DISABLE) endif() +# Disable large strings support +if(CUDF_LARGE_STRINGS_DISABLED) + target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) +endif() + # Define RMM logging level target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8a48126e195..a5b248135c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,8 +40,13 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + cudf_benchmark_common OBJECT + 
"${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp + common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp new file mode 100644 index 00000000000..0b9fc17e779 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_utilities.hpp" + +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) +{ + state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); +} + +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) +{ + state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); +} diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp new file mode 100644 index 00000000000..c5c80e73674 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Sets the number of items processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetItemsProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. + * + * @param state the benchmark state + * @param items_processed_per_iteration number of items processed per iteration + */ +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration); + +/** + * @brief Sets the number of bytes processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetBytesProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. + * + * @param state the benchmark state + * @param bytes_processed_per_iteration number of bytes processed per iteration + */ +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration); diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp new file mode 100644 index 00000000000..c740eaa52f4 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nvbench_utilities.hpp" + +#include + +// This function is copied over from +// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +void set_throughputs(nvbench::state& state) +{ + double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + + if (const auto items = state.get_element_count(); items != 0) { + auto& summ = state.add_summary("nv/cold/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cuda_time); + } + + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; + { + auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + summ.set_string("description", + "Number of bytes read/written per second to the CUDA " + "device's global memory"); + summ.set_float64("value", avg_used_gmem_bw); + } + + { + const auto peak_gmem_bw = + static_cast(state.get_device()->get_global_memory_bus_bandwidth()); + + auto& summ = state.add_summary("nv/cold/bw/global/utilization"); + summ.set_string("name", "BWUtil"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Global device memory utilization as a percentage of the " + "device's peak bandwidth"); + 
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); + } + } +} diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp new file mode 100644 index 00000000000..98d879efac5 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvbench { +struct state; +} + +/** + * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the + * nvbench results summary. + * + * This function could be used to work around a known issue that the throughput statistics + * should be added before the nvbench::state.exec() call, otherwise they will not be printed + * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. + */ +void set_throughputs(nvbench::state& state); diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp new file mode 100644 index 00000000000..a6fbdac9fb8 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "table_utilities.hpp" + +#include +#include + +#include + +int64_t estimate_size(cudf::column_view const& col) +{ + return estimate_size(cudf::table_view({col})); +} + +int64_t estimate_size(cudf::table_view const& view) +{ + // Compute the size in bits for each row. + auto const row_sizes = cudf::row_bit_count(view); + // Accumulate the row sizes to compute a sum. + auto const agg = cudf::make_sum_aggregation(); + cudf::data_type sum_dtype{cudf::type_id::INT64}; + auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); + auto const total_size_in_bits = + static_cast*>(total_size_scalar.get())->value(); + // Convert the size in bits to the size in bytes. + return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); +} diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp new file mode 100644 index 00000000000..04ee847d397 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Estimates the column size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The column view to estimate its size + */ +int64_t estimate_size(cudf::column_view const& view); + +/** + * @brief Estimates the table size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The table view to estimate its size + */ +int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 8b1e71c1585..e9d23881764 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. 
+ set_items_processed(state, column_size + 1); + set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index c1c44c919ac..5095337dbb3 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + + // We don't set the metrics for the size read/written as row_bit_count() doesn't + // support the dictionary type yet (and so neither does estimate_size()). + // See https://github.com/rapidsai/cudf/issues/16121 for details. } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 963c26692e7..050f2887221 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. 
*/ +#include #include +#include #include #include @@ -28,14 +30,19 @@ template void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); + auto const dtype_id = cudf::type_to_id(); auto const input_column = - create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); + create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { cuda_event_timer timer(state, true); auto result = cudf::minmax(*input_column); } + + // The benchmark takes a column and produces two scalars. + set_items_processed(state, column_size + 2); + cudf::data_type dtype = cudf::data_type{dtype_id}; + set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index e55f3b9e09f..14876c80d3e 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::inclusive_dense_rank_scan( + result = cudf::detail::inclusive_dense_rank_scan( input, stream_view, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input.size()); + state.add_global_memory_reads(estimate_size(input)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } using data_type = nvbench::type_list; diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 5bd3e2e3bba..63c96f4fe9e 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptrview()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 8c9883ece9c..dc05aad9807 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,11 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); + std::unique_ptr result = nullptr; for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::scan( + result = cudf::scan( *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } + + // The benchmark takes a column and produces a new column of the same size as input. + set_items_processed(state, n_rows * 2); + set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view())); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index ee97b54fbef..a781f75a314 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -45,16 +47,24 @@ static void nvbench_structs_scan(nvbench::state& state) auto [null_mask, null_count] = create_random_null_mask(size, null_probability); auto const input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); + auto input_view = input->view(); auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = cudf::detail::scan_inclusive( - *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input_view.size()); + state.add_global_memory_reads(estimate_size(input_view)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } NVBENCH_BENCH(nvbench_structs_scan) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 9758958b44f..6ec35ddcaf1 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -15,6 +15,11 @@ # This function finds cuCollections and performs any additional configuration. 
function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") + if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) else() diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff new file mode 100644 index 00000000000..0f334c0e81f --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff @@ -0,0 +1,227 @@ +diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh +index 7f9de01..5228193 100644 +--- a/include/cuco/aow_storage.cuh ++++ b/include/cuco/aow_storage.cuh +@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ +- explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; ++ explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}); + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** +@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ +- void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; ++ void initialize(value_type key, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. 
+diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh +index c2c9c14..8ac4236 100644 +--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh ++++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh +@@ -125,7 +125,7 @@ class open_addressing_impl { + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + : empty_slot_sentinel_{empty_slot_sentinel}, + erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, + predicate_{pred}, +@@ -233,7 +233,7 @@ class open_addressing_impl { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } ++ void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } + + /** + * @brief Asynchronously erases all elements from the container. 
After this call, `size()` returns +@@ -599,7 +599,7 @@ class open_addressing_impl { + * + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept ++ [[nodiscard]] size_type size(cuda_stream_ref stream) const + { + auto counter = + detail::counter_storage{this->allocator()}; +diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl +index e17a145..3fa1d02 100644 +--- a/include/cuco/detail/static_map/static_map.inl ++++ b/include/cuco/detail/static_map/static_map.inl +@@ -123,7 +123,7 @@ template + void static_map::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -215,7 +215,7 @@ template + template + void static_map:: +- insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept ++ insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) + { + return this->insert_or_assign_async(first, last, stream); + stream.synchronize(); +@@ -465,7 +465,7 @@ template + static_map::size_type + static_map::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl +index 174f9bc..582926b 100644 +--- a/include/cuco/detail/static_multiset/static_multiset.inl ++++ b/include/cuco/detail/static_multiset/static_multiset.inl +@@ -97,7 +97,7 @@ template + void static_multiset::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -183,7 +183,7 @@ template + static_multiset::size_type + static_multiset::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl +index 645013f..d3cece0 
100644 +--- a/include/cuco/detail/static_set/static_set.inl ++++ b/include/cuco/detail/static_set/static_set.inl +@@ -98,7 +98,7 @@ template + void static_set::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -429,7 +429,7 @@ template + static_set::size_type + static_set::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl +index 3547f4c..94b7f98 100644 +--- a/include/cuco/detail/storage/aow_storage.inl ++++ b/include/cuco/detail/storage/aow_storage.inl +@@ -32,8 +32,8 @@ + namespace cuco { + + template +-constexpr aow_storage::aow_storage( +- Extent size, Allocator const& allocator) noexcept ++constexpr aow_storage::aow_storage(Extent size, ++ Allocator const& allocator) + : detail::aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, +@@ -64,7 +64,7 @@ aow_storage::ref() const noexcept + + template + void aow_storage::initialize(value_type key, +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + this->initialize_async(key, stream); + stream.synchronize(); +diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh +index c86e90c..95da423 100644 +--- a/include/cuco/static_map.cuh ++++ b/include/cuco/static_map.cuh +@@ -269,7 +269,7 @@ class static_map { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. 
After this call, `size()` returns +@@ -387,7 +387,7 @@ class static_map { + * @param stream CUDA stream used for insert + */ + template +- void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; ++ void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` +@@ -690,7 +690,7 @@ class static_map { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the hash map can hold. +diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh +index 0daf103..fbcbc9c 100644 +--- a/include/cuco/static_multiset.cuh ++++ b/include/cuco/static_multiset.cuh +@@ -235,7 +235,7 @@ class static_multiset { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns +@@ -339,7 +339,7 @@ class static_multiset { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the multiset can hold. 
+diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh +index a069939..3517f84 100644 +--- a/include/cuco/static_set.cuh ++++ b/include/cuco/static_set.cuh +@@ -240,7 +240,7 @@ class static_set { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns +@@ -687,7 +687,7 @@ class static_set { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the hash set can hold. diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json new file mode 100644 index 00000000000..ae0a9a4b4f0 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_override.json @@ -0,0 +1,14 @@ + +{ + "packages" : { + "cuco" : { + "patches" : [ + { + "file" : "${current_json_dir}/cuco_noexcept.diff", + "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp index 837eda0d7b5..56ac0554403 100644 --- a/cpp/include/cudf/detail/merge.hpp +++ b/cpp/include/cudf/detail/merge.hpp @@ -46,6 +46,7 @@ using index_vector = rmm::device_uvector; * std::vector const& key_cols, * std::vector const& column_order, * std::vector const& null_precedence, + * rmm::cuda_stream_view stream, * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp 
b/cpp/include/cudf/dictionary/detail/update_keys.hpp index e8486a80afc..9cdda773dbb 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -29,7 +29,7 @@ namespace dictionary { namespace detail { /** * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -40,7 +40,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -51,7 +51,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,7 +61,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -72,7 +72,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column /** * @copydoc - * cudf::dictionary::match_dictionaries(std::vector,mm::mr::device_memory_resource*) + * cudf::dictionary::match_dictionaries(std::vector,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 56ec62fa6e1..11f6ce2bad7 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -39,6 +39,7 @@ #include #include +#include #include @@ -50,6 +51,8 @@ struct ArrowSchema; struct ArrowArray; +struct ArrowArrayStream; + namespace cudf { /** * @addtogroup interop_dlpack @@ -367,10 +370,11 @@ std::unique_ptr from_arrow( * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow data */ -std::unique_ptr from_arrow(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -385,10 +389,11 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @param mr Device memory resource used to allocate `cudf::column` * @return cudf column generated from given arrow data */ -std::unique_ptr from_arrow_column(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow_column( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input @@ -411,8 +416,26 @@ std::unique_ptr from_arrow_column(ArrowSchema const* schema, std::unique_ptr from_arrow_host( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::table` from given ArrowArrayStream input + * + * @throws std::invalid_argument if input is NULL + * + * The conversion WILL release the input ArrayArrayStream and its constituent + * arrays or schema since Arrow streams are not suitable for multiple reads. + * + * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr
from_arrow_stream( + ArrowArrayStream* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from given ArrowDeviceArray input @@ -434,8 +457,8 @@ std::unique_ptr
from_arrow_host( std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray @@ -515,8 +538,8 @@ using unique_table_view_t = unique_table_view_t from_arrow_device( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter @@ -558,8 +581,8 @@ using unique_column_view_t = unique_column_view_t from_arrow_device_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 0086432d003..60ee867f058 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,17 +24,22 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief stores offset and size used to indicate a byte range */ class byte_range_info { private: - int64_t _offset; ///< offset in bytes - int64_t _size; ///< size in bytes + int64_t _offset{}; ///< offset in bytes + int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + constexpr byte_range_info() = default; /** * @brief Constructs a byte_range_info object * @@ -104,6 +109,8 @@ std::vector create_byte_range_infos_consecutive(int64_t total_b */ byte_range_info create_byte_range_info_max(); +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 28204c82780..13aff4b3b8f 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,12 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ + /** * @brief A contract guaranteeing stream-ordered memory access to the underlying device data. 
* @@ -110,6 +116,8 @@ class data_chunk_source { [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 7abae7c754b..e29ab78ae46 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -30,6 +30,11 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief Parsing options for multibyte_split. @@ -79,6 +84,7 @@ struct parse_options { * @param source The source string * @param delimiter UTF-8 encoded string for which to find offsets in the source * @param options the parsing options to use (including byte range) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to use for the device memory allocation * @return The strings found by splitting the source by the delimiter within the relevant byte * range. @@ -87,17 +93,30 @@ std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, parse_options options = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split( +/** + * @brief Splits the source text into a strings column using a multiple byte delimiter. 
+ * + * @deprecated Since 24.08 + * + * @param source The source input data encoded in UTF-8 + * @param delimiter UTF-8 encoded string for which to find offsets in the source + * @param byte_range The position and size within `source` to produce the column from + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to use for the device memory allocation + * @return The strings found by splitting the source by the delimiter within the relevant byte + * range. + */ +[[deprecated]] std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split(data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr); +/** @} */ // end of group } // namespace text } // namespace io diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 29aa3ffe934..301e56c19b8 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -97,6 +97,7 @@ namespace cudf { * @param[in] column_order Sort order types of columns indexed by key_cols * @param[in] null_precedence Array indicating the order of nulls with respect * to non-nulls for the indexing columns (key_cols) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @returns A table containing sorted data from all input tables @@ -106,7 +107,7 @@ std::unique_ptr merge( std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace cudf diff --git 
a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. * - * This does not use regex to match targets in the string. + * This does not use regex to match targets in the string. Empty string targets are ignored. * * Null string entries will return null output string entries. * diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e9b81a525fc..c181ac7d402 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional; * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1014,7 +1014,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. 
@@ -1186,7 +1186,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1643,7 +1643,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1757,7 +1757,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. 
* @tparam Nullate A cudf::nullate type describing whether to check for nulls. diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a71e0558dec..4a990f67ce4 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -32,7 +33,7 @@ * passed by value. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Base class for a table of `ColumnView`s @@ -123,7 +124,10 @@ class table_view_base { * @param column_index The index of the desired column * @return A reference to the desired column */ - [[nodiscard]] ColumnView const& column(size_type column_index) const; + [[nodiscard]] ColumnView const& column(size_type column_index) const + { + return _columns.at(column_index); + } /** * @brief Returns the number of columns @@ -174,8 +178,17 @@ class table_view_base { * @return Whether nested columns exist in the input table */ bool has_nested_columns(table_view const& table); + } // namespace detail +/** + * @brief Determine if any nested columns exist in a given table. + * + * @param table The input table + * @return Whether nested columns exist in the input table + */ +bool has_nested_columns(table_view const& table); + /** * @brief A set of cudf::column_view's of the same size. 
* @@ -374,4 +387,4 @@ extern template bool is_relationally_comparable(mutable_tabl mutable_table_view const& rhs); // @endcond } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b66e5cab333..4be3054b3dc 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -463,10 +463,6 @@ void traverse_children::operator()(host_span */ void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), - "Type mismatch in columns to concatenate.", - cudf::data_type_error); - // total size of all concatenated rows size_t const total_row_count = std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) { @@ -476,6 +472,21 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v "Total number of concatenated rows exceeds the column size limit", std::overflow_error); + if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) { + return c.type().id() == cudf::type_id::EMPTY; + })) { + CUDF_EXPECTS( + std::all_of(cols.begin(), + cols.end(), + [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }), + "Mismatch in columns to concatenate.", + cudf::data_type_error); + return; + } + CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), + "Type mismatch in columns to concatenate.", + cudf::data_type_error); + // traverse children cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream); } @@ -498,6 +509,15 @@ std::unique_ptr concatenate(host_span columns_to_conc return empty_like(columns_to_concat.front()); } + // For empty columns, we can just create an EMPTY column of the appropriate length. 
+ if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) { + auto length = std::accumulate( + columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) { + return a + b.size(); + }); + return std::make_unique( + data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length); + } return type_dispatcher( columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index dd9e9600a87..605d813ed1e 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view) case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING); case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 002a8ec1f14..e1d289e67a3 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -56,7 +57,7 @@ struct dispatch_from_arrow_device { data_type, bool, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error); } @@ -68,7 +69,7 @@ struct dispatch_from_arrow_device { data_type type, bool skip_mask, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + 
rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* schema, @@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* s data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->length == 0) { return std::make_tuple( @@ -141,8 +142,11 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, + "Large strings are not yet supported in from_arrow_device", + cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, @@ -179,7 +183,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView keys_schema_view; NANOARROW_THROW_NOT_OK( @@ -235,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector children; owned_columns_t out_owned_cols; @@ -280,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -321,7 +325,7 @@ 
dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -339,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -394,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -426,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -436,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 854a1d68fdc..b3087dedf98 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,7 @@ namespace { struct 
dispatch_copy_from_arrow_host { rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; std::unique_ptr get_mask_buffer(ArrowArray const* array) { @@ -131,7 +132,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, @@ -188,8 +189,16 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(offset_buffers[1])[input->length + input->offset]; + int64_t const char_data_length = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + } else if (schema->type == NANOARROW_TYPE_STRING) { + return static_cast( + reinterpret_cast(offset_buffers[1])[input->length + input->offset]); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); void const* char_buffers[2] = {nullptr, input->buffers[2]}; ArrowArray char_array = { .length = char_data_length, @@ -210,15 +219,27 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()operator()(&view, &offsets_array, data_type(type_id::INT32), true); + auto offsets_column = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT64), true); + } else if (schema->type == NANOARROW_TYPE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); - auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + rmm::device_buffer chars(char_data_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + 
reinterpret_cast(char_array.buffers[1]), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + std::move(chars), input->null_count, std::move(*get_mask_buffer(input))); @@ -368,7 +389,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -385,7 +406,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -421,7 +442,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -442,7 +463,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -452,7 +473,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -462,7 +483,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -477,7 +498,7 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, std::unique_ptr from_arrow_column(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu new file mode 100644 index 00000000000..578105aa90a --- /dev/null +++ b/cpp/src/interop/from_arrow_stream.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + ArrowSchemaView schema_view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); + + auto const type{arrow_to_cudf_type(&schema_view)}; + switch (type.id()) { + case type_id::EMPTY: { + return std::make_unique( + data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); + } + case type_id::LIST: { + return cudf::make_lists_column(0, + cudf::make_empty_column(data_type{type_id::INT32}), + make_empty_column_from_schema(schema->children[0], stream, mr), + 0, + {}, + stream, + mr); + } + case type_id::STRUCT: { + std::vector> child_columns; + child_columns.reserve(schema->n_children); + std::transform( + schema->children, + schema->children + schema->n_children, + std::back_inserter(child_columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + return cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } + default: { + return cudf::make_empty_column(type); + } + } +} + +} // namespace + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); + + // Potential future optimization: Since the from_arrow API accepts an + // ArrowSchema we're allocating one here instead of using a view, which we + // could avoid with a different underlying implementation. + ArrowSchema schema; + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr)); + + std::vector> chunks; + ArrowArray chunk; + while (true) { + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr)); + if (chunk.release == nullptr) { break; } + chunks.push_back(from_arrow(&schema, &chunk, stream, mr)); + chunk.release(&chunk); + } + input->release(input); + + if (chunks.empty()) { + if (schema.n_children == 0) { + schema.release(&schema); + return std::make_unique(); + } + + // If there are no chunks but the schema has children, we need to construct a suitable empty + // table. + std::vector> columns; + columns.reserve(chunks.size()); + std::transform( + schema.children, + schema.children + schema.n_children, + std::back_inserter(columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + schema.release(&schema); + return std::make_unique(std::move(columns)); + } + + schema.release(&schema); + + auto chunk_views = std::vector{}; + chunk_views.reserve(chunks.size()); + std::transform( + chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) { + return chunk->view(); + }); + return cudf::detail::concatenate(chunk_views, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_arrow_stream(input, stream, mr); +} +} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7c4d5711281..63eb0b03c5f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator(), true, stream}; - auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, - newline, - string_scalar{"", false, stream}, - stream, - rmm::mr::get_current_device_resource()); - strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(stream); - char const* ptr_all_bytes = strings_column.chars_begin(stream); + // use strings concatenate to build the final CSV output in device memory + auto contents_w_nl = [&] { + auto const total_size = + str_column_view.chars_size(stream) + (newline.size() * str_column_view.size()); + auto const empty_str = string_scalar("", true, stream); + // use join_strings when the output will be less than 2GB + if (total_size < static_cast(std::numeric_limits::max())) { + return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr) + ->release(); + } + auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream); + // convert the last element into an empty string by resetting the last offset value + auto& offsets = nl_col->child(strings_column_view::offsets_column_index); + auto offsets_view = offsets.mutable_view(); + cudf::fill_in_place(offsets_view, + offsets.size() - 1, // set the last element with + offsets.size(), // the value from 2nd to last element + 
*cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr), + stream); + auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()}); + return cudf::strings::detail::concatenate( + nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr) + ->release(); + }(); + auto const total_num_bytes = contents_w_nl.data->size(); + auto const ptr_all_bytes = static_cast(contents_w_nl.data->data()); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink, str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), options, stream, mr); + write_chunked( + out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource()); } } } diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 405084cc4ad..1cd45fc897c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2242,131 +2242,6 @@ std::pair, std::vector> json_column_to return {}; } -table_with_metadata host_parse_nested_json(device_span d_input, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // Range of orchestrating/encapsulating function - CUDF_FUNC_RANGE(); - - auto const h_input = cudf::detail::make_std_vector_async(d_input, stream); - - auto const new_line_delimited_json = options.is_enabled_lines(); - - // Get internal JSON column - json_column root_column{}; - std::stack data_path{}; - - constexpr uint32_t row_offset_zero = 0; - constexpr uint32_t token_begin_offset_zero = 0; - constexpr uint32_t token_end_offset_zero = 0; - constexpr uint32_t node_init_child_count_zero = 0; - - // Whether the tokenizer stage should keep quote characters for string values - // If the tokenizer keeps the quote characters, they may be stripped 
during type casting - constexpr bool include_quote_chars = true; - - // We initialize the very root node and root column, which represent the JSON document being - // parsed. That root node is a list node and that root column is a list column. The column has the - // root node as its only row. The values parsed from the JSON input will be treated as follows: - // (1) For JSON lines: we expect to find a list of JSON values that all - // will be inserted into this root list column. (2) For regular JSON: we expect to have only a - // single value (list, struct, string, number, literal) that will be inserted into this root - // column. - root_column.append_row( - row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); - - // Push the root node onto the stack for the data path - data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - - make_json_column( - root_column, data_path, h_input, d_input, options, include_quote_chars, stream, mr); - - // data_root refers to the root column of the data represented by the given JSON string - auto const& data_root = - new_line_delimited_json ? root_column : root_column.child_columns.begin()->second; - - // Zero row entries - if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) { - return table_with_metadata{std::make_unique
(std::vector>{})}; - } - - // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) - auto constexpr single_child_col_count = 1; - CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and - data_root.child_columns.size() == single_child_col_count and - data_root.child_columns.begin()->second.type == json_col_t::StructColumn, - "Currently the nested JSON parser only supports an array of (nested) objects"); - - // Slice off the root list column, which has only a single row that contains all the structs - auto const& root_struct_col = data_root.child_columns.begin()->second; - - // Initialize meta data to be populated while recursing through the tree of columns - std::vector> out_columns; - std::vector out_column_names; - - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto const& json_col = root_struct_col.child_columns.find(col_name)->second; - // Insert this columns name into the schema - out_column_names.emplace_back(col_name); - - std::optional child_schema_element = std::visit( - cudf::detail::visitor_overload{ - [column_index](std::vector const& user_dtypes) -> std::optional { - auto ret = (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; -#ifdef NJP_DEBUG_PRINT - std::cout << "Column by index: #" << column_index << ", type id: " - << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") - << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" - << "\n"; -#endif - return ret; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? 
std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; -#ifdef NJP_DEBUG_PRINT - std::cout << "Column by flat name: '" << col_name << "', type id: " - << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") - << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" - << "\n"; -#endif - return ret; - }, - [col_name](std::map const& user_dtypes) - -> std::optional { - auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; -#ifdef NJP_DEBUG_PRINT - std::cout << "Column by nested name: #" << col_name << ", type id: " - << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") - << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" - << "\n"; -#endif - return ret; - }}, - options.get_dtypes()); - - // Get this JSON column's cudf column and schema info - auto [cudf_col, col_name_info] = - json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr); - out_column_names.back().children = std::move(col_name_info); - out_columns.emplace_back(std::move(cudf_col)); - - column_index++; - } - - return table_with_metadata{std::make_unique
(std::move(out_columns)), {out_column_names}}; -} - } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9c406369068..51dc0ca90af 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -565,35 +565,32 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail +// deprecated in 24.08 std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return multibyte_split( - source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); + return multibyte_split(source, + delimiter, + parse_options{byte_range.value_or(create_byte_range_info_max())}, + stream, + mr); } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, parse_options options, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto stream = cudf::get_default_stream(); - auto result = detail::multibyte_split( source, delimiter, options.byte_range, options.strip_delimiters, stream, mr); return result; } -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr) -{ - return multibyte_split(source, delimiter, parse_options{}, mr); -} - } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 42e0e4f45ee..90748e6f322 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -82,9 +82,7 @@ mixed_join( // Left and full joins all return all the row indices from // left with a corresponding NULL from the right. 
case join_kind::LEFT_JOIN: - case join_kind::FULL_JOIN: - return get_trivial_left_join_indices( - left_conditional, stream, rmm::mr::get_current_device_resource()); + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream, mr); // Inner joins return empty output because no matches can exist. case join_kind::INNER_JOIN: return std::pair(std::make_unique>(0, stream, mr), @@ -100,8 +98,7 @@ mixed_join( std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = get_trivial_left_join_indices( - right_conditional, stream, rmm::mr::get_current_device_resource()); + auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream, mr); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 8500b248fcf..c147ea3c253 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -117,9 +117,7 @@ std::unique_ptr> mixed_join_semi( // Anti and semi return all the row indices from left // with a corresponding NULL from the right. case join_kind::LEFT_ANTI_JOIN: - return get_trivial_left_join_indices( - left_conditional, stream, rmm::mr::get_current_device_resource()) - .first; + return get_trivial_left_join_indices(left_conditional, stream, mr).first; // Inner and left semi joins return empty output because no matches can exist. 
case join_kind::LEFT_SEMI_JOIN: return std::make_unique>(0, stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 630cf328579..7ecaa0fba56 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -694,11 +694,11 @@ std::unique_ptr merge(std::vector const& tables_to_merg std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::merge( - tables_to_merge, key_cols, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::merge(tables_to_merge, key_cols, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 43a3d69091a..2ca22f0e017 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -451,8 +451,8 @@ struct replace_multi_fn { while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match { auto const d_repl = (d_repls.size() == 1) ? 
d_repls.element(0) : d_repls.element(tgt_idx); @@ -468,9 +468,8 @@ struct replace_multi_fn { } ++spos; } - if (out_ptr) // copy remainder - { - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + if (out_ptr) { + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // copy remainder } else { d_sizes[idx] = bytes; } diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 101004a5d06..f70598f33be 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -158,8 +158,13 @@ int64_t get_offset64_threshold() bool is_large_strings_enabled() { + // default depends on compile-time switch but can be overridden by the environment variable auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED"); +#ifdef CUDF_LARGE_STRINGS_DISABLED return env != nullptr && std::string(env) == "1"; +#else + return env == nullptr || std::string(env) == "1"; +#endif } int64_t get_offset_value(cudf::column_view const& offsets, diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 13832b0d9dc..8a5340dc20d 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -52,12 +52,6 @@ auto concatenate_column_views(std::vector const& views) return concat_cols; } -template -ColumnView const& table_view_base::column(size_type column_index) const -{ - return _columns.at(column_index); -} - // Explicit instantiation for a table of `column_view`s template class table_view_base; @@ -172,6 +166,7 @@ bool has_nested_columns(table_view const& table) return std::any_of( table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); }); } - } // namespace detail + +bool has_nested_columns(table_view const& table) { return detail::has_nested_columns(table); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eef09954647..8e2017ccb97 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,6 +273,7 @@ ConfigureTest( 
interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp + interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow @@ -691,6 +692,8 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_MERGE_TEST streams/merge_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 054441788d0..18140c34abd 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest) std::vector empty; EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error); } + +struct EmptyColumnTest : public cudf::test::BaseFixture {}; + +TEST_F(EmptyColumnTest, SimpleTest) +{ + std::vector columns; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + for (auto i = 0; i < num_copies; ++i) { + columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0); + } + + // Create views from columns + std::vector views; + for (auto& col : columns) { + views.push_back(col.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->size(), num_copies * num_rows); + ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY); +} + +struct TableOfEmptyColumnsTest : 
public cudf::test::BaseFixture {}; + +TEST_F(TableOfEmptyColumnsTest, SimpleTest) +{ + std::vector tables; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + constexpr auto num_columns = 10; + for (auto i = 0; i < num_copies; ++i) { + std::vector> columns; + for (auto j = 0; j < num_columns; ++j) { + columns.push_back(std::make_unique(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0)); + } + tables.emplace_back(std::move(columns)); + } + + // Create views from tables + std::vector views; + for (auto& tbl : tables) { + views.push_back(tbl.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->num_rows(), num_copies * num_rows); + ASSERT_EQ(result->num_columns(), num_columns); + for (auto i = 0; i < num_columns; ++i) { + ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY); + } +} diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp new file mode 100644 index 00000000000..80a2e4b2ffd --- /dev/null +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct VectorOfArrays { + std::vector arrays; + nanoarrow::UniqueSchema schema; + size_t index{0}; + + static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema) + { + auto private_data = static_cast(stream->private_data); + + auto const rc = ArrowSchemaDeepCopy(private_data->schema.get(), out_schema); + return rc; + } + + static int get_next(ArrowArrayStream* stream, ArrowArray* out_array) + { + auto private_data = static_cast(stream->private_data); + if (private_data->index >= private_data->arrays.size()) { + out_array->release = nullptr; + return 0; + } + ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array); + return 0; + } + + static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; } + + static void release(ArrowArrayStream* stream) + { + delete static_cast(stream->private_data); + } +}; + +struct FromArrowStreamTest : public cudf::test::BaseFixture {}; + +void makeStreamFromArrays(std::vector arrays, + nanoarrow::UniqueSchema schema, + ArrowArrayStream* out) +{ + auto* private_data = new VectorOfArrays{std::move(arrays), std::move(schema)}; + out->get_schema = VectorOfArrays::get_schema; + out->get_next = VectorOfArrays::get_next; + out->get_last_error = VectorOfArrays::get_last_error; + out->release = VectorOfArrays::release; + out->private_data = private_data; +} + +TEST_F(FromArrowStreamTest, BasicTest) +{ + constexpr auto num_copies = 3; + std::vector> tables; + // A single schema is kept for the stream; it is taken from the first table.
+ nanoarrow::UniqueSchema schema; + std::vector arrays; + for (auto i = 0; i < num_copies; ++i) { + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + tables.push_back(std::move(tbl)); + arrays.push_back(std::move(arr)); + if (i == 0) { sch.move(schema.get()); } + } + std::vector table_views; + for (auto const& table : tables) { + table_views.push_back(table->view()); + } + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays(std::move(arrays), std::move(schema), &stream); + auto result = cudf::from_arrow_stream(&stream); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); +} + +TEST_F(FromArrowStreamTest, EmptyTest) +{ + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + std::vector table_views{tbl->view()}; + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays({}, std::move(sch), &stream); + auto result = cudf::from_arrow_stream(&stream); + EXPECT_TRUE(cudf::have_same_types(expected->view(), result->view())); +} diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 94c4372e74a..4147728b2a6 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::tuple, nanoarrow::UniqueSchema, generated_test_data> get_nanoarrow_cudf_table(cudf::size_type length); + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 36338253c9b..408d54bd5ff 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out =
cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size())}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size())}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) auto source = 
cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = 
cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = 
multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp 
b/cpp/tests/streams/io/multibyte_split_test.cpp new file mode 100644 index 00000000000..b0eff1d3340 --- /dev/null +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +class MultibyteSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(MultibyteSplitTest, Reader) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abc:def"); + auto source = cudf::io::text::make_source(host_input); + cudf::io::text::parse_options options{}; + auto result = + cudf::io::text::multibyte_split(*source, delimiter, options, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/merge_test.cpp b/cpp/tests/streams/merge_test.cpp new file mode 100644 index 00000000000..1dfe877878d --- /dev/null +++ b/cpp/tests/streams/merge_test.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +template +class MergeTest_ : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(MergeTest_, cudf::test::FixedWidthTypes); + +TYPED_TEST(MergeTest_, MergeIsZeroWhenShouldNotBeZero) +{ + using columnFactoryT = cudf::test::fixed_width_column_wrapper; + + columnFactoryT leftColWrap1({1, 2, 3, 4, 5}); + cudf::test::fixed_width_column_wrapper rightColWrap1{}; + + std::vector key_cols{0}; + std::vector column_order; + column_order.push_back(cudf::order::ASCENDING); + std::vector null_precedence(column_order.size(), cudf::null_order::AFTER); + + cudf::table_view left_view{{leftColWrap1}}; + cudf::table_view right_view{{rightColWrap1}}; + cudf::table_view expected{{leftColWrap1}}; + + auto result = cudf::merge({left_view, right_view}, + key_cols, + column_order, + null_precedence, + cudf::test::get_default_stream()); + + int expected_len = 5; + ASSERT_EQ(result->num_rows(), expected_len); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view()); +} + +TYPED_TEST(MergeTest_, SingleTableInput) +{ + cudf::size_type inputRows = 40; + + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + cudf::test::fixed_width_column_wrapper + colWrap1(sequence, sequence + inputRows); + + std::vector key_cols{0}; + std::vector column_order{cudf::order::ASCENDING}; + std::vector null_precedence{}; + + cudf::table_view left_view{{colWrap1}}; + + std::unique_ptr p_outputTable; + CUDF_EXPECT_NO_THROW( + p_outputTable = cudf::merge( + {left_view}, key_cols, column_order, null_precedence, cudf::test::get_default_stream())); + + auto input_column_view{left_view.column(0)}; + auto output_column_view{p_outputTable->view().column(0)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(input_column_view, output_column_view); +} + +class 
MergeTest : public cudf::test::BaseFixture {}; + +TEST_F(MergeTest, KeysWithNulls) +{ + cudf::size_type nrows = 13200; // Ensures that thrust::merge uses more than one tile/block + auto data_iter = thrust::make_counting_iterator(0); + auto valids1 = + cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 10 != 0; }); + cudf::test::fixed_width_column_wrapper data1(data_iter, data_iter + nrows, valids1); + auto valids2 = + cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 15 != 0; }); + cudf::test::fixed_width_column_wrapper data2(data_iter, data_iter + nrows, valids2); + auto all_data = cudf::concatenate(std::vector{{data1, data2}}, + cudf::test::get_default_stream()); + + std::vector column_orders{cudf::order::ASCENDING, cudf::order::DESCENDING}; + std::vector null_precedences{cudf::null_order::AFTER, cudf::null_order::BEFORE}; + + for (auto co : column_orders) + for (auto np : null_precedences) { + std::vector column_order{co}; + std::vector null_precedence{np}; + auto sorted1 = cudf::sort(cudf::table_view({data1}), + column_order, + null_precedence, + cudf::test::get_default_stream()) + ->release(); + auto col1 = sorted1.front()->view(); + auto sorted2 = cudf::sort(cudf::table_view({data2}), + column_order, + null_precedence, + cudf::test::get_default_stream()) + ->release(); + auto col2 = sorted2.front()->view(); + + auto result = cudf::merge({cudf::table_view({col1}), cudf::table_view({col2})}, + {0}, + column_order, + null_precedence, + cudf::test::get_default_stream()); + auto sorted_all = cudf::sort(cudf::table_view({all_data->view()}), + column_order, + null_precedence, + cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_all->view().column(0), result->view().column(0)); + } +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 3aa7467d156..6c4afbb435a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ 
b/cpp/tests/strings/replace_tests.cpp @@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) } } +TEST_F(StringsReplaceTest, EmptyTarget) +{ + auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"}); + auto const sv = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"}); + auto const tv = cudf::strings_column_view(targets); + + auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"}); + auto const rv = cudf::strings_column_view(repls); + + // empty target should be ignored + auto results = cudf::strings::replace_multiple(sv, tv, rv); + auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu index 896cc7a82d4..0d9e4e27f2c 100644 --- a/cpp/tests/table/experimental_row_operator_tests.cu +++ b/cpp/tests/table/experimental_row_operator_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -109,15 +109,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTables) auto const lhs = cudf::table_view{{col1}}; auto const empty_rhs = cudf::table_view{{col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const test_sort_two_tables = [&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { @@ -188,15 +187,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTablesWithListsOfStructs) auto const lhs = cudf::table_view{{*col1}}; auto const empty_rhs = cudf::table_view{{*col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const 
test_sort_two_tables = [&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { diff --git a/cpp/tests/table/row_operator_tests_utilities.cu b/cpp/tests/table/row_operator_tests_utilities.cu index cfffa1cdd54..6127864987d 100644 --- a/cpp/tests/table/row_operator_tests_utilities.cu +++ b/cpp/tests/table/row_operator_tests_utilities.cu @@ -42,7 +42,7 @@ std::unique_ptr two_table_comparison(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) || cudf::has_nested_columns(rhs)) { thrust::transform(rmm::exec_policy(stream), lhs_it, lhs_it + lhs.num_rows(), @@ -129,7 +129,7 @@ std::unique_ptr two_table_equality(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) or cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) or cudf::has_nested_columns(rhs)) { auto const equal_comparator = table_comparator.equal_to(cudf::nullate::NO{}, cudf::null_equality::EQUAL, comparator); diff --git a/cpp/tests/table/row_operator_tests_utilities2.cu b/cpp/tests/table/row_operator_tests_utilities2.cu index 057d9ee1004..17d274eba13 100644 --- a/cpp/tests/table/row_operator_tests_utilities2.cu +++ b/cpp/tests/table/row_operator_tests_utilities2.cu @@ -41,7 +41,7 @@ std::unique_ptr self_comparison(cudf::table_view input, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(input)) { + if (cudf::has_nested_columns(input)) { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), diff --git a/dependencies.yaml 
b/dependencies.yaml index 38ec30a8033..27621ff9a3f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -243,7 +243,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - &ninja ninja build_all: common: @@ -287,8 +287,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.* - - libkvikio==24.8.* + - librmm==24.8.*,>=0.0.0a0 + - libkvikio==24.8.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -500,7 +500,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.* + - dask-cuda==24.8.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -582,7 +582,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pynvjitlink + - pynvjitlink>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - cubinlinker @@ -592,7 +592,7 @@ dependencies: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.8.*,>=0.0.0a0 - - pynvjitlink-cu12 + - pynvjitlink-cu12>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.8.*,>=0.0.0a0 @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.30 + - polars>=1.0 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -755,7 +755,7 @@ dependencies: - {matrix: null, packages: *cupy_packages_cu11} test_python_pandas_cudf: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: # dependencies to run pandas tests # https://github.com/pandas-dev/pandas/blob/main/environment.yml @@ -766,7 +766,7 @@ dependencies: - pytest-reportlog test_python_cudf_pandas: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - ipython - openpyxl diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 108f12bc099..c3c14ac8cad 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ 
-372,7 +372,7 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, "numeric": {}, "nvtext": {}, } diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e9dad705cbf..bd6f0f77357 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby - io/index.rst interop join lists merge quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits types unary @@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 0d53ac92db9..bde6d8094ce 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -16,3 +16,4 @@ I/O Functions :maxdepth: 1 avro + json diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst new file mode 100644 index 00000000000..6aeae1f322a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst @@ -0,0 +1,6 @@ +==== +JSON +==== + +.. 
automodule:: cudf._lib.pylibcudf.io.json + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index a8fef907bad..22e34feb547 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - make_sink_info, - make_source_info, - update_struct_field_names, -) -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink +from cudf._lib.io.utils cimport make_source_info, update_struct_field_names from cudf._lib.pylibcudf.libcudf.io.json cimport ( json_reader_options, json_recovery_mode_t, - json_writer_options, read_json as libcudf_read_json, schema_element, - write_json as libcudf_write_json, ) from cudf._lib.pylibcudf.libcudf.io.types cimport ( - column_name_info, compression_type, - sink_info, - table_metadata, table_with_metadata, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport data_from_unique_ptr + +import cudf._lib.pylibcudf as plc cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): @@ -175,45 +164,27 @@ def 
write_json( -------- cudf.to_json """ - cdef table_view input_table_view = table_view_from_table( - table, ignore_index=True - ) - - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef string na_c = na_rep.encode() - cdef bool include_nulls_c = include_nulls - cdef bool lines_c = lines - cdef int rows_per_chunk_c = rows_per_chunk - cdef string true_value_c = 'true'.encode() - cdef string false_value_c = 'false'.encode() - cdef table_metadata tbl_meta - - num_index_cols_meta = 0 - cdef column_name_info child_info - for i, name in enumerate(table._column_names, num_index_cols_meta): - child_info.name = name.encode() - tbl_meta.schema_info.push_back(child_info) - _set_col_children_metadata( - table[name]._column, - tbl_meta.schema_info[i] - ) + cdef list colnames = [] - cdef json_writer_options options = move( - json_writer_options.builder(sink_info_c, input_table_view) - .metadata(tbl_meta) - .na_rep(na_c) - .include_nulls(include_nulls_c) - .lines(lines_c) - .rows_per_chunk(rows_per_chunk_c) - .true_value(true_value_c) - .false_value(false_value_c) - .build() - ) + for name in table._column_names: + colnames.append((name, _dtype_to_names_list(table[name]._column))) try: - with nogil: - libcudf_write_json(options) + plc.io.json.write_json( + plc.io.SinkInfo([path_or_buf]), + plc.io.TableWithMetadata( + plc.Table([ + c.to_pylibcudf(mode="read") for c in table._columns + ]), + colnames + ), + na_rep, + include_nulls, + lines, + rows_per_chunk, + true_value="true", + false_value="false" + ) except OverflowError: raise OverflowError( f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. 
" @@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: ) return dtype_to_data_type(dtype) -cdef _set_col_children_metadata(Column col, - column_name_info& col_meta): - cdef column_name_info child_info + +def _dtype_to_names_list(col): if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - child_info.name = name.encode() - col_meta.children.push_back(child_info) - _set_col_children_metadata( - child_col, col_meta.children[i] - ) + return [(name, _dtype_to_names_list(child)) + for name, child in zip(col.dtype.fields, col.children)] elif isinstance(col.dtype, cudf.ListDtype): - for i, child_col in enumerate(col.children): - col_meta.children.push_back(child_info) - _set_col_children_metadata( - child_col, col_meta.children[i] - ) - else: - return + return [("", _dtype_to_names_list(child)) + for child in col.children] + return [] diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0a198f431a7..d22096081af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -38,6 +38,7 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx + traits.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5131df9a5cd..d4d615cde34 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -23,6 +23,7 @@ from . 
cimport ( sorting, stream_compaction, strings, + traits, types, unary, ) @@ -54,12 +55,14 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 43a9e2aca31..91f8acaf682 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -23,6 +23,7 @@ sorting, stream_compaction, strings, + traits, types, unary, ) @@ -35,6 +36,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -54,12 +56,14 @@ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 07e9d1ead11..adf7e1fd7e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+from cpython cimport pycapsule from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move @@ -11,9 +12,15 @@ from functools import singledispatch from pyarrow import lib as pa +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.interop cimport ( + ArrowArray, + ArrowArrayStream, + ArrowSchema, column_metadata, from_arrow as cpp_from_arrow, + from_arrow_column as cpp_from_arrow_column, + from_arrow_stream as cpp_from_arrow_stream, to_arrow as cpp_to_arrow, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( @@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object): def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for tables") - cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object) + stream = pyarrow_object.__arrow_c_stream__() + cdef ArrowArrayStream* c_stream = ( + pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + ) cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_table))) + # The libcudf function here will release the stream. 
+ c_result = move(cpp_from_arrow_stream(c_stream)) return Table.from_libcudf(move(c_result)) @@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") - pa_table = pa.table([pyarrow_object], [""]) - return from_arrow(pa_table).columns()[0] + + schema, array = pyarrow_object.__arrow_c_array__() + cdef ArrowSchema* c_schema = ( + pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + ) + cdef ArrowArray* c_array = ( + pycapsule.PyCapsule_GetPointer(array, "arrow_array") + ) + + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_from_arrow_column(c_schema, c_array)) + + # The capsule destructors should release automatically for us, but we + # choose to do it explicitly here for clarity. + c_schema.release(c_schema) + c_array.release(c_array) + + return Column.from_libcudf(move(c_result)) @singledispatch diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 32f0f5543e4..084b341ec48 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources avro.pyx datasource.pyx types.pyx) +set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,5 +21,7 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types) +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json + pylibcudf_io_types +) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index cfd6d2cd281..ef4c65b277e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport avro, datasource, types +from . cimport avro, datasource, json, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index a54ba1834dc..fb4e4c7e4bb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, datasource, types -from .types import SourceInfo, TableWithMetadata +from . 
import avro, datasource, json, types +from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 946e0896fc8..538bd8aa322 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro( size_type num_rows = -1 ): """ - Reads an Avro dataset into a set of columns. + Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. Parameters ---------- @@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro( Returns ------- TableWithMetadata - The Table and its corresponding metadata that was read in. + The Table and its corresponding metadata (column names) that were read in. """ cdef vector[string] c_columns if columns is not None and len(columns) > 0: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd new file mode 100644 index 00000000000..a91d574131f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef void write_json( + SinkInfo sink_info, + TableWithMetadata tbl, + str na_rep = *, + bool include_nulls = *, + bool lines = *, + size_type rows_per_chunk = *, + str true_value = *, + str false_value = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx new file mode 100644 index 00000000000..7530eba3803 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -0,0 +1,68 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.limits cimport numeric_limits +from libcpp.string cimport string + +from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_writer_options, + write_json as cpp_write_json, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata +from cudf._lib.pylibcudf.types cimport size_type + + +cpdef void write_json( + SinkInfo sink_info, + TableWithMetadata table_w_meta, + str na_rep = "", + bool include_nulls = False, + bool lines = False, + size_type rows_per_chunk = numeric_limits[size_type].max(), + str true_value = "true", + str false_value = "false" +): + """ + Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format. + + Parameters + ---------- + sink_info: SinkInfo + The SinkInfo object to write the JSON to. + table_w_meta: TableWithMetadata + The TableWithMetadata object containing the Table to write + na_rep: str, default "" + The string representation for null values. + include_nulls: bool, default False + Enables/Disables output of nulls as 'null'. + lines: bool, default False + If `True`, write output in the JSON lines format. + rows_per_chunk: size_type, defaults to length of the input table + The maximum number of rows to write at a time. + true_value: str, default "true" + The string representation for values != 0 in INT8 types. + false_value: str, default "false" + The string representation for values == 0 in INT8 types. 
+ """ + cdef table_metadata tbl_meta = table_w_meta.metadata + cdef string na_rep_c = na_rep.encode() + + cdef json_writer_options options = ( + json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view()) + .metadata(tbl_meta) + .na_rep(na_rep_c) + .include_nulls(include_nulls) + .lines(lines) + .build() + ) + + if rows_per_chunk != numeric_limits[size_type].max(): + options.set_rows_per_chunk(rows_per_chunk) + if true_value != "true": + options.set_true_value(true_value.encode()) + if false_value != "false": + options.set_false_value(false_value.encode()) + + with nogil: + cpp_write_json(options) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index aa846a47343..88daf54f33b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -1,4 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_encoding, column_in_metadata, @@ -22,8 +26,15 @@ cdef class TableWithMetadata: cdef public Table tbl cdef table_metadata metadata + cdef vector[column_name_info] _make_column_info(self, list column_names) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) cdef class SourceInfo: cdef source_info c_obj + +cdef class SinkInfo: + # This vector just exists to keep the unique_ptrs to the sinks alive + cdef vector[unique_ptr[data_sink]] sink_storage + cdef sink_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index ab3375da662..f94e20970a4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -1,17 +1,23 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from cpython.buffer cimport PyBUF_READ +from cpython.memoryview cimport PyMemoryView_FromMemory +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.pylibcudf.io.datasource cimport Datasource +from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_name_info, host_buffer, source_info, table_with_metadata, ) +import codecs import errno import io import os @@ -22,7 +28,39 @@ cdef class TableWithMetadata: (e.g. column names) For details, see :cpp:class:`cudf::io::table_with_metadata`. + + Parameters + ---------- + tbl : Table + The input table. + column_names : list + A list of tuples each containing the name of each column + and the names of its child columns (in the same format). + e.g. + [("id", []), ("name", [("first", []), ("last", [])])] + """ + def __init__(self, Table tbl, list column_names): + self.tbl = tbl + + self.metadata.schema_info = self._make_column_info(column_names) + + cdef vector[column_name_info] _make_column_info(self, list column_names): + cdef vector[column_name_info] col_name_infos + cdef column_name_info info + + col_name_infos.reserve(len(column_names)) + + for name, child_names in column_names: + if not isinstance(name, str): + raise ValueError("Column name must be a string!") + + info.name = name.encode() + info.children = self._make_column_info(child_names) + + col_name_infos.push_back(info) + + return col_name_infos @property def columns(self): @@ -51,6 +89,7 @@ cdef class TableWithMetadata: out.metadata = tbl_with_meta.metadata return out + cdef class SourceInfo: """A class containing details on a source to read from. 
@@ -119,7 +158,87 @@ cdef class SourceInfo: raise ValueError("Sources must be a list of str/paths, " "bytes, io.BytesIO, or a Datasource") - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) + self.c_obj = source_info(c_host_buffers) + + +# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you +# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) +cdef cppclass iobase_data_sink(data_sink): + object buf + + iobase_data_sink(object buf_): + this.buf = buf_ + + void host_write(const void * data, size_t size) with gil: + if isinstance(buf, io.TextIOBase): + buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) + .tobytes().decode()) + else: + buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) + + void flush() with gil: + buf.flush() + + size_t bytes_written() with gil: + return buf.tell() + + +cdef class SinkInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::sink_info`. + + Parameters + ---------- + sinks : list of str, PathLike, BytesIO, StringIO + + A homogeneous list of sinks (this can be a string filename, + bytes, or one of the Python I/O classes) to read from. + + Mixing different types of sinks will raise a `ValueError`. 
+ """ + + def __init__(self, list sinks): + cdef vector[data_sink *] data_sinks + cdef vector[string] paths + + if not sinks: + raise ValueError("Need to pass at least one sink") + + if isinstance(sinks[0], os.PathLike): + sinks = [os.path.expanduser(s) for s in sinks] + + cdef object initial_sink_cls = type(sinks[0]) + + if not all(isinstance(s, initial_sink_cls) for s in sinks): + raise ValueError("All sinks must be of the same type!") + + if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}: + data_sinks.reserve(len(sinks)) + if isinstance(sinks[0], (io.StringIO, io.BytesIO)): + for s in sinks: + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s)) + ) + elif isinstance(sinks[0], io.TextIOBase): + for s in sinks: + if codecs.lookup(s).name not in ('utf-8', 'ascii'): + raise NotImplementedError(f"Unsupported encoding {s.encoding}") + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s.buffer)) + ) + data_sinks.push_back(self.sink_storage.back().get()) + elif initial_sink_cls is str: + paths.reserve(len(sinks)) + for s in sinks: + paths.push_back( s.encode()) + else: + raise TypeError( + "Unrecognized input type: {}".format(type(sinks[0])) + ) - self.c_obj = move(source_info(c_host_buffers)) + if data_sinks.size() > 0: + self.c_obj = sink_info(data_sinks) + else: + # we don't have sinks so we must have paths to sinks + self.c_obj = sink_info(paths) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 471b78505fb..2151da28d4b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table 
cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: void(*deleter)(DLManagedTensor*) except + + +# The Arrow structs are not namespaced. +cdef extern from "cudf/interop.hpp" nogil: + cdef struct ArrowSchema: + void (*release)(ArrowSchema*) noexcept nogil + + cdef struct ArrowArray: + void (*release)(ArrowArray*) noexcept nogil + + cdef struct ArrowArrayStream: + void (*release)(ArrowArrayStream*) noexcept nogil + + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor @@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ const scalar& input, column_metadata metadata, ) except + + + cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + + cdef unique_ptr[column] from_arrow_column( + const ArrowSchema* schema, + const ArrowArray* input + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd new file mode 100644 index 00000000000..0382a5d42c3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) + + +cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] reverse( + const lists_column_view& lists_column, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..c9d0a84e8ac 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column reverse(Column) 
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..651f1346f88 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + reverse as cpp_reverse, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, @@ -206,3 +207,28 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o find_option, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column reverse(Column input): + """Reverse the element order within each list of the input column. + + For details, see :cpp:func:`reverse`. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column with reversed lists. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_reverse.reverse( + list_view.view(), + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..d2370f8d641 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + For details, see :cpp:func:`is_relationally_comparable`. + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + For details, see :cpp:func:`is_equality_comparable`. + """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + For details, see :cpp:func:`is_numeric`. + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. 
+ + For details, see :cpp:func:`is_index_type`. + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + For details, see :cpp:func:`is_unsigned`. + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + For details, see :cpp:func:`is_integral`. + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + For details, see :cpp:func:`is_integral_not_bool`. + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + For details, see :cpp:func:`is_floating_point`. + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + For details, see :cpp:func:`is_boolean`. + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + For details, see :cpp:func:`is_timestamp`. + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + For details, see :cpp:func:`is_fixed_point`. + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + For details, see :cpp:func:`is_duration`. + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. + + For details, see :cpp:func:`is_chrono`. + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. 
+ + For details, see :cpp:func:`is_dictionary`. + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + For details, see :cpp:func:`is_fixed_width`. + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + For details, see :cpp:func:`is_compound`. + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + For details, see :cpp:func:`is_nested`. + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + For details, see :cpp:func:`is_bit_castable`. + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5db6fd904a9..e7a2863da8c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: + if len(self) == 0: + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + if copy: + return self.copy() + else: + return self + else: + return column_empty(0, dtype=dtype, masked=self.nullable) if copy: col = self.copy() else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 121076b69ce..c10aceba9f4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -280,8 +280,8 @@ def __contains__(self, item: ScalarLike) -> bool: return False elif ts.tzinfo is not None: ts = ts.tz_convert(None) - return ts.to_numpy().astype("int64") in self.as_numerical_column( - "int64" + return 
ts.to_numpy().astype("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") ) @functools.cached_property @@ -503,9 +503,9 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical_column("int64").mean( - skipna=skipna, min_count=min_count, dtype=dtype - ), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, min_count=min_count, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) @@ -517,7 +517,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -525,7 +525,9 @@ def std( def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def quantile( self, @@ -554,7 +556,7 @@ def quantile( exact: 
bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -645,12 +647,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical_column("int64") - return self.as_numerical_column("int64").indices_of(value) + ).astype("int64") + return self.astype("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical_column("int64").is_unique + return self.astype("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d66908b5f94..3e238d65cff 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -157,7 +157,7 @@ def normalize_binop_value(self, other): "Decimal columns only support binary operations with " "integer numerical columns." 
) - other = other.as_decimal_column( + other = other.astype( self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0) ) elif not isinstance(other, DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f24ca3fdad1..d09a1f66539 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -4,7 +4,7 @@ import cudf from cudf.core.column import StructColumn -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import IntervalDtype class IntervalColumn(StructColumn): @@ -87,20 +87,16 @@ def copy(self, deep=True): def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): - if isinstance(self.dtype, CategoricalDtype): - new_struct = self._get_decategorized_column() - return IntervalColumn.from_struct_column(new_struct) - else: - return IntervalColumn( - size=self.size, - dtype=dtype, - mask=self.mask, - offset=self.offset, - null_count=self.null_count, - children=tuple( - child.astype(dtype.subtype) for child in self.children - ), - ) + return IntervalColumn( + size=self.size, + dtype=dtype, + mask=self.mask, + offset=self.offset, + null_count=self.null_count, + children=tuple( + child.astype(dtype.subtype) for child in self.children + ), + ) else: raise ValueError("dtype must be IntervalDtype") diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c548db67344..1992d471947 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -73,10 +73,15 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) n += child0_size + current_offset_col = current_base_child.base_children[0] + if not len(current_offset_col): + # See 
https://github.com/rapidsai/cudf/issues/16164 why + # offset column can be uninitialized + break + current_offset = current_offset_col.element_indexing( + current_offset + ) current_base_child = current_base_child.base_children[1] n += ( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8f41bcb6422..5a0171bbbdc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.view("int64") in self.as_numerical_column("int64") + return item.view("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ) @property def values(self): @@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical_column("int64") - .data_array_view(mode="read") - .copy_to_host() + self.astype("int64").data_array_view(mode="read").copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -295,13 +295,17 @@ def as_timedelta_column( def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -315,7 +319,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = 
self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -337,7 +341,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. - self.as_numerical_column("int64").sum( # type: ignore + self.astype("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -351,7 +355,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, @@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def components(self) -> dict[str, ColumnBase]: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4dfeb68b7ba..b249410c2e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2404,7 +2404,7 @@ def scatter_by_map( if isinstance(map_index, cudf.core.column.StringColumn): cat_index = cast( cudf.core.column.CategoricalColumn, - 
map_index.as_categorical_column("category"), + map_index.astype("category"), ) map_index = cat_index.codes warnings.warn( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 034849d0e71..de715191c08 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -937,7 +937,7 @@ def to_pandas(self) -> pd.IntervalDtype: def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas - return other == self.name + return other in (self.name, str(self)) return ( type(self) == type(other) and self.subtype == other.subtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9bac75dc6ac..253d200f7d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. - result[name] = result[name].as_categorical_column("category") + result[name] = result[name].astype("category") elif ( pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object" @@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty # float column to str column. 
- result[name] = result[name].as_string_column(cudf.dtype("str")) + result[name] = result[name].astype(cudf.dtype("str")) elif name in data.column_names and isinstance( data[name].type, ( diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 73a1cd26367..a5fed02cbed 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: else: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): - key = key.as_numerical_column(key.codes.dtype) + key = key.astype(key.codes.dtype) if is_bool_dtype(key.dtype): return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 97b6bbec2d4..4a60470fafa 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3107,10 +3107,12 @@ def value_counts( # Pandas returns an IntervalIndex as the index of res # this condition makes sure we do too if bins is given if bins is not None and len(res) == len(res.index.categories): - int_index = IntervalColumn.as_interval_column( - res.index._column, res.index.categories.dtype + interval_col = IntervalColumn.from_struct_column( + res.index._column._get_decategorized_column() + ) + res.index = cudf.IntervalIndex._from_data( + {res.index.name: interval_col} ) - res.index = int_index res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 68b23f1e059..ef6b86a04a7 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(cudf.dtype("int64")) + col = col.astype(cudf.dtype("int64")) elif 
isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): - col = col.as_numerical_column(cat_dtype) + col = col.astype(cat_dtype) else: try: col = _convert_str_col( @@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == cudf.dtype("f"): - col = col.as_numerical_column("d") + if col.dtype == cudf.dtype("float32"): + col = col.astype("float64") if downcast: if downcast == "float": @@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=cudf.dtype("i8")) + return col.astype(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.as_numerical_column(dtype=cudf.dtype("f")) + return col.astype(dtype=cudf.dtype("float32")) else: - return col.as_numerical_column(dtype=cudf.dtype("d")) + return col.astype(dtype=cudf.dtype("float64")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index bf927e661fe..d41e6c720bf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,24 +1,39 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from __future__ import annotations +import io +import os + import pyarrow as pa import pytest from cudf._lib import pylibcudf as plc -def metadata_from_arrow_array( - pa_array: pa.Array, +def metadata_from_arrow_type( + pa_type: pa.Array, + name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = None - if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata(name) # None + if pa.types.is_list(pa_type): + child_meta = [plc.interop.ColumnMetadata("offsets")] + for i in range(pa_type.num_fields): + field_meta = metadata_from_arrow_type( + pa_type.field(i).type, pa_type.field(i).name + ) + child_meta.append(field_meta) + metadata = plc.interop.ColumnMetadata(name, child_meta) + elif pa.types.is_struct(pa_type): + child_meta = [] + for i in range(pa_type.num_fields): + field_meta = metadata_from_arrow_type( + pa_type.field(i).type, pa_type.field(i).name + ) + child_meta.append(field_meta) metadata = plc.interop.ColumnMetadata( - "", + name, # libcudf does not store field names, so just match pyarrow's. - [ - plc.interop.ColumnMetadata(pa_array.type.field(i).name) - for i in range(pa_array.type.num_fields) - ], + child_meta, ) return metadata @@ -32,13 +47,13 @@ def assert_column_eq( rhs, plc.Column ): rhs = plc.interop.to_arrow( - rhs, metadata=metadata_from_arrow_array(lhs) + rhs, metadata=metadata_from_arrow_type(lhs.type) ) elif isinstance(lhs, plc.Column) and isinstance( rhs, (pa.Array, pa.ChunkedArray) ): lhs = plc.interop.to_arrow( - lhs, metadata=metadata_from_arrow_array(rhs) + lhs, metadata=metadata_from_arrow_type(rhs.type) ) else: raise ValueError( @@ -87,56 +102,84 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_unsigned_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - +def is_string(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.STRING -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - ) +def nesting_level(typ) -> tuple[int, int]: + """Return list and struct nesting of a pyarrow type.""" + if isinstance(typ, pa.ListType): + list_, struct = nesting_level(typ.value_type) + return list_ + 1, struct + elif isinstance(typ, pa.StructType): + lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ))) + return lists, structs + 1 + else: + return 0, 0 -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) +def is_nested_struct(typ): + return nesting_level(typ)[1] > 1 -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 +def is_nested_list(typ): + return nesting_level(typ)[0] > 1 -def is_string(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.STRING +def sink_to_str(sink): + """ + Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) 
+ and reads in the contents into a string (str not bytes) + for comparison + """ + if isinstance(sink, (str, os.PathLike)): + with open(sink, "r") as f: + str_result = f.read() + elif isinstance(sink, io.BytesIO): + sink.seek(0) + str_result = sink.read().decode() + else: + sink.seek(0) + str_result = sink.read() + return str_result -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) +NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] +STRING_PA_TYPES = [pa.string()] +BOOL_PA_TYPES = [pa.bool_()] +LIST_PA_TYPES = [ + pa.list_(pa.int64()), + # Nested case + pa.list_(pa.list_(pa.int64())), +] # We must explicitly specify this type via a field to ensure we don't include # nullability accidentally. DEFAULT_STRUCT_TESTING_TYPE = pa.struct( [pa.field("v", pa.int64(), nullable=False)] ) +NESTED_STRUCT_TESTING_TYPE = pa.struct( + [ + pa.field("a", pa.int64(), nullable=False), + pa.field( + "b_struct", + pa.struct([pa.field("b", pa.float64(), nullable=False)]), + nullable=False, + ), + ] +) + +DEFAULT_PA_STRUCT_TESTING_TYPES = [ + DEFAULT_STRUCT_TESTING_TYPE, + NESTED_STRUCT_TESTING_TYPE, +] + +DEFAULT_PA_TYPES = ( + NUMERIC_PA_TYPES + + STRING_PA_TYPES + + BOOL_PA_TYPES + + LIST_PA_TYPES + + DEFAULT_PA_STRUCT_TESTING_TYPES +) + +ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index b169bbdee5b..e4760ea7ac8 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -1,9 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
# Tell ruff it's OK that some imports occur after the sys.path.insert # ruff: noqa: E402 +import io import os +import pathlib import sys +import numpy as np import pyarrow as pa import pytest @@ -11,7 +14,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from utils import DEFAULT_STRUCT_TESTING_TYPE +from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES # This fixture defines the standard set of types that all tests should default to @@ -20,14 +23,7 @@ # across modules. Otherwise it may be defined on a per-module basis. @pytest.fixture( scope="session", - params=[ - pa.int64(), - pa.float64(), - pa.string(), - pa.bool_(), - pa.list_(pa.int64()), - DEFAULT_STRUCT_TESTING_TYPE, - ], + params=DEFAULT_PA_TYPES, ) def pa_type(request): return request.param @@ -35,16 +31,96 @@ def pa_type(request): @pytest.fixture( scope="session", - params=[ - pa.int64(), - pa.float64(), - pa.uint64(), - ], + params=NUMERIC_PA_TYPES, ) def numeric_pa_type(request): return request.param +# TODO: Consider adding another fixture/adapting this +# fixture to consider nullability +@pytest.fixture(scope="session", params=[0, 100]) +def table_data(request): + """ + Returns (TableWithMetadata, pa_table). + + This is the default fixture you should be using for testing + pylibcudf I/O writers. + + Contains one of each category (e.g. int, bool, list, struct) + of dtypes. 
+ """ + nrows = request.param + + table_dict = {} + # Colnames in the format expected by + # plc.io.TableWithMetadata + colnames = [] + + np.random.seed(42) + + for typ in ALL_PA_TYPES: + rand_vals = np.random.randint(0, nrows, nrows) + child_colnames = [] + + def _generate_nested_data(typ): + child_colnames = [] + + # recurse to get vals for children + rand_arrs = [] + for i in range(typ.num_fields): + rand_arr, grandchild_colnames = _generate_nested_data( + typ.field(i).type + ) + rand_arrs.append(rand_arr) + child_colnames.append((typ.field(i).name, grandchild_colnames)) + + if isinstance(typ, pa.StructType): + pa_array = pa.StructArray.from_arrays( + [rand_arr for rand_arr in rand_arrs], + names=[typ.field(i).name for i in range(typ.num_fields)], + ) + elif isinstance(typ, pa.ListType): + pa_array = pa.array( + [list(row_vals) for row_vals in zip(rand_arrs[0])], + type=typ, + ) + child_colnames.append(("", grandchild_colnames)) + else: + # typ is scalar type + pa_array = pa.array(rand_vals).cast(typ) + return pa_array, child_colnames + + if isinstance(typ, (pa.ListType, pa.StructType)): + rand_arr, child_colnames = _generate_nested_data(typ) + else: + rand_arr = pa.array(rand_vals).cast(typ) + + table_dict[f"col_{typ}"] = rand_arr + colnames.append((f"col_{typ}", child_colnames)) + + pa_table = pa.Table.from_pydict(table_dict) + + return plc.io.TableWithMetadata( + plc.interop.from_arrow(pa_table), column_names=colnames + ), pa_table + + +@pytest.fixture( + params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO], +) +def source_or_sink(request, tmp_path): + fp_or_buf = request.param + if isinstance(fp_or_buf, str): + return f"{tmp_path}/{fp_or_buf}" + elif isinstance(fp_or_buf, os.PathLike): + return tmp_path.joinpath(fp_or_buf) + elif issubclass(fp_or_buf, io.IOBase): + # Must construct io.StringIO/io.BytesIO inside + # fixture, or we'll end up re-using it + return fp_or_buf() + + @pytest.fixture( scope="session", params=[opt for opt in 
plc.types.Interpolation] ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py similarity index 100% rename from python/cudf/cudf/pylibcudf_tests/test_avro.py rename to python/cudf/cudf/pylibcudf_tests/io/test_avro.py diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py new file mode 100644 index 00000000000..d6b8bfa6976 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import io + +import pyarrow as pa +import pytest +from utils import sink_to_str + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("rows_per_chunk", [8, 100]) +@pytest.mark.parametrize("lines", [True, False]) +def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): + plc_table_w_meta, pa_table = table_data + sink = source_or_sink + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_table_w_meta, + lines=lines, + rows_per_chunk=rows_per_chunk, + ) + + exp = pa_table.to_pandas() + + # Convert everything to string to make + # comparisons easier + str_result = sink_to_str(sink) + + pd_result = exp.to_json(orient="records", lines=lines) + + assert str_result == pd_result + + +@pytest.mark.parametrize("include_nulls", [True, False]) +@pytest.mark.parametrize("na_rep", ["null", "awef", ""]) +def test_write_json_nulls(na_rep, include_nulls): + names = ["a", "b"] + pa_tbl = pa.Table.from_arrays( + [pa.array([1.0, 2.0, None]), pa.array([True, None, False])], + names=names, + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + plc_tbl_w_meta = plc.io.types.TableWithMetadata( + plc_tbl, column_names=[(name, []) for name in names] + ) + + sink = io.StringIO() + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_tbl_w_meta, + na_rep=na_rep, + include_nulls=include_nulls, + ) + + exp = pa_tbl.to_pandas() + + # Convert everything to string to make + # comparisons easier + 
str_result = sink_to_str(sink) + pd_result = exp.to_json(orient="records") + + if not include_nulls: + # No equivalent in pandas, so we just + # sanity check by making sure na_rep + # doesn't appear in the output + + # don't quote null + for name in names: + assert f'{{"{name}":{na_rep}}}' not in str_result + return + + # pandas doesn't support na_rep + # let's just manually do str.replace + pd_result = pd_result.replace("null", na_rep) + + assert str_result == pd_result + + +@pytest.mark.parametrize("true_value", ["True", "correct"]) +@pytest.mark.parametrize("false_value", ["False", "wrong"]) +def test_write_json_bool_opts(true_value, false_value): + names = ["a"] + pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names) + plc_tbl = plc.interop.from_arrow(pa_tbl) + plc_tbl_w_meta = plc.io.types.TableWithMetadata( + plc_tbl, column_names=[(name, []) for name in names] + ) + + sink = io.StringIO() + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_tbl_w_meta, + include_nulls=True, + na_rep="null", + true_value=true_value, + false_value=false_value, + ) + + exp = pa_tbl.to_pandas() + + # Convert everything to string to make + # comparisons easier + str_result = sink_to_str(sink) + pd_result = exp.to_json(orient="records") + + # pandas doesn't support true_value/false_value + # let's just manually do str.replace + if true_value != "true": + pd_result = pd_result.replace("true", true_value) + if false_value != "false": + pd_result = pd_result.replace("false", false_value) + + assert str_result == pd_result diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py similarity index 72% rename from python/cudf/cudf/pylibcudf_tests/test_source_info.py rename to python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 019321b7259..287dd8f21c8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ 
-9,6 +9,21 @@ from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource +@pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) +def io_class(request): + return request.param + + +def _skip_invalid_sinks(io_class, sink): + """ + Skip invalid sinks for SinkInfo + """ + if io_class is plc.io.SinkInfo and isinstance( + sink, (bytes, NativeFileDatasource) + ): + pytest.skip(f"{sink} is not a valid input for SinkInfo") + + @pytest.mark.parametrize( "source", [ @@ -18,16 +33,15 @@ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) -def test_source_info_ctor(source, tmp_path): +def test_source_info_ctor(io_class, source, tmp_path): if isinstance(source, str): file = tmp_path / source file.write_bytes("hello world".encode("utf-8")) source = str(file) - plc.io.SourceInfo([source]) + _skip_invalid_sinks(io_class, source) - # TODO: test contents of source_info buffer is correct - # once buffers are exposed on python side + io_class([source]) @pytest.mark.parametrize( @@ -42,7 +56,7 @@ def test_source_info_ctor(source, tmp_path): ], ], ) -def test_source_info_ctor_multiple(sources, tmp_path): +def test_source_info_ctor_multiple(io_class, sources, tmp_path): for i in range(len(sources)): source = sources[i] if isinstance(source, str): @@ -50,10 +64,9 @@ def test_source_info_ctor_multiple(sources, tmp_path): file.write_bytes("hello world".encode("utf-8")) sources[i] = str(file) - plc.io.SourceInfo(sources) + _skip_invalid_sinks(io_class, source) - # TODO: test contents of source_info buffer is correct - # once buffers are exposed on python side + io_class(sources) @pytest.mark.parametrize( @@ -73,7 +86,7 @@ def test_source_info_ctor_multiple(sources, tmp_path): ], ], ) -def test_source_info_ctor_mixing_invalid(sources, tmp_path): +def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): # Unlike the previous test # don't create files so that they are missing for i in range(len(sources)): @@ -82,8 +95,9 @@ def 
test_source_info_ctor_mixing_invalid(sources, tmp_path): file = tmp_path / source file.write_bytes("hello world".encode("utf-8")) sources[i] = str(file) + _skip_invalid_sinks(io_class, source) with pytest.raises(ValueError): - plc.io.SourceInfo(sources) + io_class(sources) def test_source_info_invalid(): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index da3ca3a6d1e..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -5,19 +5,21 @@ import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, + NESTED_STRUCT_TESTING_TYPE, assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, + is_nested_list, + is_nested_struct, is_string, - metadata_from_arrow_array, + metadata_from_arrow_type, ) from cudf._lib import pylibcudf as plc +# TODO: consider moving this to conftest and "pairing" +# it with pa_type, so that they don't get out of sync # TODO: Test nullable data @pytest.fixture(scope="module") def input_column(pa_type): @@ -28,10 +30,27 @@ def input_column(pa_type): elif pa.types.is_boolean(pa_type): pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[1], [2], [3]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array([[1], [2, 3], [3]], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array([[[1]], [[2, 3]], [[3]]], type=pa_type) + else: + raise ValueError("Unsupported type " + pa_type.value_type) elif pa.types.is_struct(pa_type): - pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + if not is_nested_struct(pa_type): + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + pa_array = pa.array( + [ + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 2, 
"b_struct": {"b": 2.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -55,13 +74,37 @@ def target_column(pa_type): [False, True, True, False, True, False], type=pa_type ) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array( + [[4], [5, 6], [7], [8], [9], [10]], type=pa_type + ) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array( + [[[4]], [[5, 6]], [[7]], [[8]], [[9]], [[10]]], type=pa_type + ) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_array = pa.array( - [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], - type=pa_type, - ) + if not is_nested_struct(pa_type): + pa_array = pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + else: + pa_array = pa.array( + [ + {"a": 4, "b_struct": {"b": 4.0}}, + {"a": 5, "b_struct": {"b": 5.0}}, + {"a": 6, "b_struct": {"b": 6.0}}, + {"a": 7, "b_struct": {"b": 7.0}}, + {"a": 8, "b_struct": {"b": 8.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -96,10 +139,22 @@ def source_scalar(pa_type): elif pa.types.is_boolean(pa_type): pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Longer list? 
- pa_scalar = pa.scalar([1], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_scalar = pa.scalar([1, 2, 3, 4], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_scalar = pa.scalar([[1, 2, 3, 4]], type=pa_type) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_scalar = pa.scalar({"v": 1}, type=pa_type) + if not is_nested_struct(pa_type): + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + pa_scalar = pa.scalar( + {"a": 1, "b_struct": {"b": 1.0}}, type=pa_type + ) else: raise ValueError("Unsupported type") return pa_scalar, plc.interop.from_arrow(pa_scalar) @@ -196,27 +251,54 @@ def test_scatter_table( ) if pa.types.is_list(dtype := pa_target_table[0].type): - expected = pa.table( - [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 - ) + if is_nested_list(dtype): + expected = pa.table( + [pa.array([[[4]], [[1]], [[2, 3]], [[3]], [[9]], [[10]]])] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [pa.array([[4], [1], [2, 3], [3], [9], [10]])] * 3, + [""] * 3, + ) elif pa.types.is_struct(dtype): - expected = pa.table( - [ - pa.array( - [ - {"v": 4}, - {"v": 1}, - {"v": 2}, - {"v": 3}, - {"v": 8}, - {"v": 9}, - ], - type=DEFAULT_STRUCT_TESTING_TYPE, - ) - ] - * 3, - [""] * 3, - ) + if is_nested_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"a": 4, "b_struct": {"b": 4.0}}, + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + {"a": 8, "b_struct": {"b": 8.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=NESTED_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [ + pa.array( + [ + {"v": 4}, + {"v": 1}, + {"v": 2}, + {"v": 3}, + {"v": 8}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) else: expected = _pyarrow_boolean_mask_scatter_table( pa_source_table, @@ -274,9 +356,9 @@ def 
test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -343,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] @@ -373,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -399,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -431,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -443,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def 
test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -463,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -483,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -525,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -544,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) 
or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -557,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -627,6 +719,7 @@ def test_split_column_out_of_bounds(target_column): def test_split_table(target_table): pa_target_table, plc_target_table = target_table + upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(plc_target_table, upper_bounds) @@ -661,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -718,6 +813,7 @@ def test_copy_if_else_column_scalar( pa_target_column, plc_target_column = target_column pa_source_scalar, plc_source_scalar = source_scalar pa_mask, plc_mask = mask + args = ( (plc_target_column, plc_source_scalar) if array_left @@ -766,27 +862,58 @@ def test_boolean_mask_scatter_from_table( ) if pa.types.is_list(dtype := pa_target_table[0].type): - expected = pa.table( - [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 - ) + if is_nested_list(dtype): + expected = pa.table( + [ + pa.array( + [[[1]], [[5, 6]], [[2, 3]], [[8]], [[3]], [[10]]] + ) + ] + * 3, + [""] 
* 3, + ) + else: + expected = pa.table( + [pa.array([[1], [5, 6], [2, 3], [8], [3], [10]])] * 3, + [""] * 3, + ) elif pa.types.is_struct(dtype): - expected = pa.table( - [ - pa.array( - [ - {"v": 1}, - {"v": 5}, - {"v": 2}, - {"v": 7}, - {"v": 3}, - {"v": 9}, - ], - type=DEFAULT_STRUCT_TESTING_TYPE, - ) - ] - * 3, - [""] * 3, - ) + if is_nested_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 5, "b_struct": {"b": 5.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 7, "b_struct": {"b": 7.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=NESTED_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [ + pa.array( + [ + {"v": 1}, + {"v": 5}, + {"v": 2}, + {"v": 7}, + {"v": 3}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) else: expected = _pyarrow_boolean_mask_scatter_table( pa_source_table, pa_mask, pa_target_table @@ -833,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -887,7 +1014,7 @@ def test_get_element(input_column): assert ( plc.interop.to_arrow( - result, metadata_from_arrow_array(pa_input_column) + result, metadata_from_arrow_type(pa_input_column.type) ).as_py() == pa_input_column[index].as_py() ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..58a1dcf8d56 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ 
b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -134,3 +134,15 @@ def test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +def test_reverse(test_data): + list_column = test_data[0][0] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.reverse(plc_column) + + expect = pa.array([lst[::-1] for lst in list_column]) + + assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not 
plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + 
plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index ea919c786b9..c288155112c 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -515,17 +515,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): np.testing.assert_array_equal(expect_mask, got_mask) -def test_concatenate_large_column_strings(): - num_strings = 1_000_000 - string_scale_f = 100 - - s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings) - s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings) - - with pytest.raises(OverflowError): - cudf.concat([s_1, s_2]) - - @pytest.mark.parametrize( "alias,expect_dtype", [ diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 1b395c09ba8..5eeea87d8e0 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype(): result = cudf.from_pandas(dtype) expected = cudf.IntervalDtype("int64", closed="left") assert_eq(result, expected) + + +def test_intervaldtype_eq_string_with_attributes(): + dtype = cudf.IntervalDtype("int64", closed="left") + assert dtype == "interval" + assert dtype == "interval[int64, left]" diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f76143cb381..ec9d7995b05 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -12,6 +12,7 @@ from cudf import NA from cudf._lib.copying import get_element from cudf.api.types import is_scalar +from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @@ -926,3 +927,29 @@ def 
test_list_iterate_error(): def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) + nested_col = col.children[1] + empty_inner = type(nested_col)( + size=nested_col.size, + dtype=nested_col.dtype, + mask=nested_col.mask, + offset=nested_col.offset, + null_count=nested_col.null_count, + children=( + column_empty(0, nested_col.children[0].dtype), + nested_col.children[1], + ), + ) + col_empty_offset = type(col)( + size=col.size, + dtype=col.dtype, + mask=col.mask, + offset=col.offset, + null_count=col.null_count, + children=(column_empty(0, col.children[0].dtype), empty_inner), + ) + ser = cudf.Series._from_data({None: col_empty_offset}) + assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 467d0c46ae7..8ed78d804bf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type): assert_eq(expected, got) - assert pa_string_array.equals(got.to_arrow()) - @pytest.mark.parametrize( "scalar", @@ -2873,3 +2871,42 @@ def test_nunique_all_null(dropna): result = pd_ser.nunique(dropna=dropna) expected = cudf_ser.nunique(dropna=dropna) assert result == expected + + +@pytest.mark.parametrize( + "type1", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "type2", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] +) +@pytest.mark.parametrize("copy", [True, False]) +def test_empty_astype_always_castable(type1, type2, as_dtype, copy): + ser = 
cudf.Series([], dtype=as_dtype(type1)) + result = ser.astype(as_dtype(type2), copy=copy) + expected = cudf.Series([], dtype=as_dtype(type2)) + assert_eq(result, expected) + if not copy and cudf.dtype(type1) == cudf.dtype(type2): + assert ser._column is result._column + else: + assert ser._column is not result._column diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 20b731624df..dcb33b1fc1a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -121,7 +121,7 @@ skip = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 11e18cd4f32..badfdf06d15 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -101,7 +101,7 @@ regex = "(?P.*)" build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. 
+ exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..d86656578d7 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,6 +5,7 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast @@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 17d7d15e4e5..93cb9db7cbd 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -27,11 +27,12 @@ import cudf._lib.pylibcudf as plc from cudf_polars.containers import Column, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Mapping, Sequence + import polars.polars as plrs import polars.type_aliases as pl_types from cudf_polars.containers import DataFrame @@ -43,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -369,6 +371,29 @@ def do_evaluate( return 
Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None: + super().__init__(dtype) + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. + return Column(plc.interop.from_arrow(self.value)) + + class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") @@ -679,6 +704,7 @@ def _validate_input(self): pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.Slice, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -692,6 +718,11 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) def do_evaluate( self, @@ -720,6 +751,36 @@ def do_evaluate( flags=plc.strings.regex_flags.RegexFlags.DEFAULT, ) return Column(plc.strings.contains.contains_re(column.obj, prog)) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. 
Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. + # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -755,6 +816,129 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("round", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # 
construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") @@ -954,15 +1138,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Agg(Expr): __slots__ = ("name", "options", "op", "request", "children") _non_child = ("dtype", "name", "options") - children: tuple[Expr] + children: tuple[Expr, ...] def __init__( - self, dtype: plc.DataType, name: str, options: Any, value: Expr + self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: super().__init__(dtype) self.name = name self.options = options - self.children = (value,) + self.children = children if name not in Agg._SUPPORTED: raise NotImplementedError( f"Unsupported aggregation {name=}" @@ -1156,6 +1340,13 @@ def __init__( super().__init__(dtype) self.op = op self.children = (left, right) + if ( + op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) + and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) + ): + raise NotImplementedError("Casting rules for timelike types") _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 3f5f3c74050..6b552642e88 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,6 +15,7 @@ import dataclasses import itertools +import json import types from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar @@ -29,7 +30,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import 
DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import MutableMapping @@ -95,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: @@ -130,6 +133,11 @@ class IR: schema: Schema """Mapping from column names to their data types.""" + def __post_init__(self): + """Validate preconditions.""" + if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): + raise NotImplementedError("Cannot make empty columns.") + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ Evaluate the node and return a dataframe. @@ -175,8 +183,10 @@ def __post_init__(self): class Scan(IR): """Input from files.""" - typ: Any + typ: str """What type of file are we reading? Parquet, CSV, etc...""" + options: tuple[Any, ...] 
+ """Type specific options, as json-encoded strings.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -206,17 +216,21 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": + opts, cloud_opts = map(json.loads, self.options) df = DataFrame.from_cudf( cudf.concat( [cudf.read_csv(p, usecols=with_columns) for p in self.paths] ) ) elif self.typ == "parquet": + opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) else: - assert_never(self.typ) + raise NotImplementedError( + f"Unhandled scan type: {self.typ}" + ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index dtype = self.schema[name] @@ -292,15 +306,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = pdf.to_arrow() schema = table.schema for i, field in enumerate(schema): - # TODO: Nested types - if field.type == pa.large_string(): - # TODO: goes away when libcudf supports large strings - schema = schema.set(i, pa.field(field.name, pa.string())) - elif isinstance(field.type, pa.LargeListType): - # TODO: goes away when libcudf supports large lists - schema = schema.set( - i, pa.field(field.name, pa.list_(field.type.field(0))) - ) + schema = schema.set( + i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) + ) + # No-op if the schema is unchanged. table = table.cast(schema) df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) @@ -424,7 +433,7 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 953ff636cce..5a1e682abe7 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -12,6 +12,7 @@ import pyarrow as pa from typing_extensions import assert_never +import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir import cudf._lib.pylibcudf as plc @@ -86,9 +87,11 @@ def _( def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + typ, *options = node.scan_type return ir.Scan( schema, - node.scan_type, + typ, + tuple(options), node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -358,8 +361,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register @@ -383,6 +401,8 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> 
expr.Expr: + if isinstance(node.value, plrs.PySeries): + return expr.LiteralColumn(dtype, node.value) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) @@ -427,8 +447,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) - else: - return expr.Cast(dtype, inner) + elif isinstance(inner, expr.Cast): + # Translation of Len/Count-agg put in a cast, remove double + # casts if we have one. + (inner,) = inner.children + return expr.Cast(dtype, inner) @_translate_expr.register @@ -438,12 +461,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, - translate_expr(visitor, n=node.arguments), + *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -470,7 +496,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value # pragma: no cover; never reached since polars len has uint32 dtype def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 3d4a643e1fc..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -7,13 +7,79 @@ from functools import cache +import pyarrow as pa 
from typing_extensions import assert_never import polars as pl import cudf._lib.pylibcudf as plc -__all__ = ["from_polars"] +__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] + + +def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): + """ + Do two datetime typeids have matching resolution for a binop. + + Parameters + ---------- + lid + Left type id + rid + Right type id + + Returns + ------- + True if resolutions are compatible, False otherwise. + + Notes + ----- + Polars has different casting rules for combining + datetimes/durations than libcudf, and while we don't encode the + casting rules fully, just reject things we can't handle. + + Precondition for correctness: both lid and rid are timelike. + """ + if lid == rid: + return True + # Timestamps are smaller than durations in the libcudf enum. + lid, rid = sorted([lid, rid]) + if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: + return rid == plc.TypeId.DURATION_MILLISECONDS + elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: + return rid == plc.TypeId.DURATION_MICROSECONDS + elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: + return rid == plc.TypeId.DURATION_NANOSECONDS + return False + + +def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: + """ + Sanitize an arrow datatype from polars. + + Parameters + ---------- + typ + Arrow type to sanitize + + Returns + ------- + Sanitized arrow type + + Notes + ----- + As well as arrow ``ListType``s, polars can produce + ``LargeListType``s and ``FixedSizeListType``s, these are not + currently handled by libcudf, so we attempt to cast them all into + normal ``ListType``s on the arrow side before consuming the arrow + data. + """ + if isinstance(typ, pa.LargeListType): + return pa.list_(downcast_arrow_lists(typ.value_type)) + # We don't have to worry about diving into struct types for now + # since those are always NotImplemented before we get here. 
+ assert not isinstance(typ, pa.StructType) + return typ @cache diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index effa4861e0c..0b559f7a8e9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.8.*,>=0.0.0a0", - "polars>=0.20.30", + "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -182,5 +182,3 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" -# Pure python -disable-cuda = true diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import methodcaller + import pytest import polars as pl @@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. 
+ # https://github.com/rapidsai/cudf/issues/16196 + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py new file mode 100644 index 00000000000..55e688428bd --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) +from cudf_polars.utils import dtypes + + +@pytest.fixture( + params=[ + None, + pl.Int8(), + pl.Int16(), + pl.Int32(), + pl.Int64(), + pl.UInt8(), + pl.UInt16(), + pl.UInt32(), + pl.UInt64(), + ] +) +def integer(request): + return pl.lit(10, dtype=request.param) + + +@pytest.fixture(params=[None, pl.Float32(), pl.Float64()]) +def float(request): + return pl.lit(1.0, dtype=request.param) + + +def test_numeric_literal(integer, float): + df = pl.LazyFrame({}) + + q = df.select(integer=integer, float_=float, sum_=integer + float) + + assert_gpu_result_equal(q) + + +@pytest.fixture( + params=[pl.Date(), pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")] +) +def timestamp(request): + return pl.lit(10_000, dtype=request.param) + + +@pytest.fixture(params=[pl.Duration("ms"), pl.Duration("us"), pl.Duration("ns")]) +def timedelta(request): + return pl.lit(9_000, dtype=request.param) + + +def test_timelike_literal(timestamp, timedelta): + df = pl.LazyFrame({}) + + q = df.select( + time=timestamp, + delta=timedelta, + adjusted=timestamp + timedelta, + two_delta=timedelta + timedelta, + ) + schema = q.collect_schema() + time_type = schema["time"] + delta_type = schema["delta"] + if dtypes.have_compatible_resolution( + dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + 
): + assert_gpu_result_equal(q) + else: + assert_ir_translation_raises(q, NotImplementedError) + + +def test_select_literal_series(): + df = pl.LazyFrame({}) + + q = df.select( + a=pl.Series(["a", "b", "c"], dtype=pl.String()), + b=pl.Series([[1, 2], [3], None], dtype=pl.List(pl.UInt16())), + c=pl.Series([[[1]], [], [[1, 2, 3, 4]]], dtype=pl.List(pl.List(pl.Float32()))), + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())]) +def test_unsupported_literal_raises(expr): + df = pl.LazyFrame({}) + + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..3af3a0ce6d1 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 9729e765948..8cf65dd51ac 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -37,6 +37,30 @@ def 
ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) +slice_cases = [ + (1, 3), + (0, 3), + (0, 0), + (-3, 1), + (-100, 5), + (1, 1), + (100, 100), + (-3, 4), + (-3, 3), +] + + +@pytest.fixture(params=slice_cases) +def slice_column_data(ldf, request): + start, length = request.param + if length: + return ldf.with_columns( + pl.lit(start).alias("start"), pl.lit(length).alias("length") + ) + else: + return ldf.with_columns(pl.lit(start).alias("start")) + + def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), @@ -104,3 +128,25 @@ def test_contains_invalid(ldf): query.collect() with pytest.raises(pl.exceptions.ComputeError): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) + + +@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) +def test_slice_scalars_offset(ldf, offset): + query = ldf.select(pl.col("a").str.slice(offset)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("offset,length", slice_cases) +def test_slice_scalars_length_and_offset(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) + + +def test_slice_column(slice_column_data): + if "length" in slice_column_data.collect_schema(): + query = slice_column_data.select( + pl.col("a").str.slice(pl.col("start"), pl.col("length")) + ) + else: + query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) + assert_ir_translation_raises(query, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py index 1ffe06ac562..b5c0fb7be9f 100644 --- a/python/cudf_polars/tests/test_dataframescan.py +++ b/python/cudf_polars/tests/test_dataframescan.py @@ -41,3 +41,22 @@ def test_scan_drop_nulls(subset, predicate_pushdown): assert_gpu_result_equal( q, collect_kwargs={"predicate_pushdown": predicate_pushdown} ) + + +def test_can_convert_lists(): + df = pl.LazyFrame( + { + "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())), + "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())), + "c": pl.Series( + [ + [["1", "2", "3"], ["4", "567"]], + [["8", "9"], []], + ], + dtype=pl.List(pl.List(pl.String())), + ), + "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))), + } + ) + + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index aefad59eb91..b84e2c16b43 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -47,6 +47,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], 
ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -83,10 +85,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): def test_groupby_len(df, keys): q = df.group_by(*keys).agg(pl.len()) - # TODO: polars returns UInt32, libcudf returns Int32 - with pytest.raises(AssertionError): - assert_gpu_result_equal(q, check_row_order=False) - assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False) + assert_gpu_result_equal(q, check_row_order=False) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index b021d832910..865b95a7d91 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -46,3 +46,12 @@ def test_concat_vertical(): q = pl.concat([ldf, ldf2], how="vertical") assert_gpu_result_equal(q) + + +def test_concat_diagonal_empty(): + df1 = pl.LazyFrame() + df2 = pl.LazyFrame({"a": [1, 2]}) + + q = pl.concat([df1, df2], how="diagonal_relaxed") + + assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})