diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a35802f2ab0..ceee9074b93 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,7 +25,8 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf - - test-cudf-polars + - wheel-build-cudf-polars + - wheel-tests-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -133,9 +134,18 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - test-cudf-polars: + wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_cudf_polars.sh" + wheel-tests-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -143,7 +153,7 @@ jobs: build_type: pull-request # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars - script: "ci/test_cudf_polars.sh" + script: "ci/test_wheel_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh new file mode 100755 index 00000000000..9c945e11c00 --- /dev/null +++ b/ci/build_wheel_cudf_polars.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir="python/cudf_polars" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh new file mode 100755 index 00000000000..c10612a065a --- /dev/null +++ b/ci/run_cudf_polars_pytests.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level. + +# Support invoking run_cudf_polars_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ + +python -m pytest --cache-clear "$@" tests diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh similarity index 67% rename from ci/test_cudf_polars.sh rename to ci/test_wheel_cudf_polars.sh index 95fb4b431bf..900acd5d473 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,19 +18,14 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "Install cudf wheel" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/cudf*.whl)[test] +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install ./local-cudf-dep/cudf*.whl 
rapids-logger "Install cudf_polars" -python -m pip install 'polars>=1.0' -python -m pip install --no-deps python/cudf_polars +python -m pip install $(echo ./dist/cudf_polars*.whl)[test] rapids-logger "Run cudf_polars tests" @@ -42,13 +37,11 @@ EXITCODE=0 trap set_exitcode ERR set +e -python -m pytest \ - --cache-clear \ +./ci/run_cudf_polars_pytests.sh \ --cov cudf_polars \ --cov-fail-under=100 \ - --cov-config=python/cudf_polars/pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ - python/cudf_polars/tests + --cov-config=./pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" trap ERR set -e diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 2b20b9d9ce4..c3800d3cc25 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE=" # Download the cudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +python -m pip install ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc9238ab80a..b8d73a01f96 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cubinlinker - cuda-nvtx=11.8 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 9fecd452248..c32d21c5d36 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -15,7 +15,7 
@@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cuda-cudart-dev - cuda-nvcc diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c01178bf732..4f99411e978 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" libarrow_version: - "==16.1.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2811711d58c..7999ada9282 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -925,6 +925,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) add_library( ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp ) + if(CUDF_USE_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions( + ${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM + ) + endif() set_target_properties( ${_tgt} diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index e8486a80afc..9cdda773dbb 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ 
b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -29,7 +29,7 @@ namespace dictionary { namespace detail { /** * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -40,7 +40,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -51,7 +51,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,7 +61,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -72,7 +72,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column /** * @copydoc - * cudf::dictionary::match_dictionaries(std::vector,mm::mr::device_memory_resource*) + * cudf::dictionary::match_dictionaries(std::vector,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 502ffb9ba4f..11f6ce2bad7 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -39,6 +39,7 @@ #include #include +#include #include @@ -372,8 +373,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -391,8 +392,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow_column( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input @@ -415,8 +416,8 @@ std::unique_ptr from_arrow_column( std::unique_ptr from_arrow_host( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowArrayStream input @@ -433,8 +434,8 @@ std::unique_ptr
from_arrow_host( */ std::unique_ptr
from_arrow_stream( ArrowArrayStream* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from given ArrowDeviceArray input @@ -456,8 +457,8 @@ std::unique_ptr
from_arrow_stream( std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray @@ -537,8 +538,8 @@ using unique_table_view_t = unique_table_view_t from_arrow_device( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter @@ -580,8 +581,8 @@ using unique_column_view_t = unique_column_view_t from_arrow_device_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. * - * This does not use regex to match targets in the string. 
+ * This does not use regex to match targets in the string. Empty string targets are ignored. * * Null string entries will return null output string entries. * diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 73c1a474310..e1d289e67a3 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -56,7 +57,7 @@ struct dispatch_from_arrow_device { data_type, bool, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error); } @@ -68,7 +69,7 @@ struct dispatch_from_arrow_device { data_type type, bool skip_mask, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* schema, @@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* s data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->length == 0) { return std::make_tuple( @@ -141,7 +142,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, "Large strings are not yet supported in from_arrow_device", @@ -182,7 +183,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView keys_schema_view; NANOARROW_THROW_NOT_OK( @@ -238,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector children; owned_columns_t out_owned_cols; @@ -283,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -324,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -342,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -397,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -429,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -439,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b7e07056686..b3087dedf98 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,7 @@ namespace { struct dispatch_copy_from_arrow_host { rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; std::unique_ptr get_mask_buffer(ArrowArray const* array) { @@ -131,7 +132,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, @@ -388,7 +389,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -405,7 +406,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -441,7 +442,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -462,7 +463,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -472,7 +473,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -482,7 +483,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -497,7 +498,7 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, std::unique_ptr from_arrow_column(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index 0c85b561944..578105aa90a 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -41,7 +41,7 @@ namespace { std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView schema_view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); @@ -81,7 +81,7 @@ std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); @@ -135,7 +135,7 @@ std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::from_arrow_stream(input, stream, mr); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2b3aa2f08f1..62b85891adb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -376,7 +376,12 @@ std::shared_ptr dispatch_to_arrow::operator()( metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); if (child_arrays.empty()) { - return std::make_shared(arrow::list(arrow::null()), 0, nullptr, nullptr); + // Empty list will have only one value in offset of 4 bytes + auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); + memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); + + return std::make_shared( + arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7c4d5711281..63eb0b03c5f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator(), true, stream}; - auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, - newline, - string_scalar{"", false, stream}, - stream, - rmm::mr::get_current_device_resource()); - strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(stream); - char const* ptr_all_bytes = strings_column.chars_begin(stream); + // use strings concatenate to build the final CSV output in 
device memory + auto contents_w_nl = [&] { + auto const total_size = + str_column_view.chars_size(stream) + (newline.size() * str_column_view.size()); + auto const empty_str = string_scalar("", true, stream); + // use join_strings when the output will be less than 2GB + if (total_size < static_cast(std::numeric_limits::max())) { + return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr) + ->release(); + } + auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream); + // convert the last element into an empty string by resetting the last offset value + auto& offsets = nl_col->child(strings_column_view::offsets_column_index); + auto offsets_view = offsets.mutable_view(); + cudf::fill_in_place(offsets_view, + offsets.size() - 1, // set the last element with + offsets.size(), // the value from 2nd to last element + *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr), + stream); + auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()}); + return cudf::strings::detail::concatenate( + nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr) + ->release(); + }(); + auto const total_num_bytes = contents_w_nl.data->size(); + auto const ptr_all_bytes = static_cast(contents_w_nl.data->data()); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink, str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), options, stream, mr); + write_chunked( + out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource()); } } } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 43a3d69091a..2ca22f0e017 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -451,8 +451,8 @@ struct 
replace_multi_fn { while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match { auto const d_repl = (d_repls.size() == 1) ? d_repls.element(0) : d_repls.element(tgt_idx); @@ -468,9 +468,8 @@ struct replace_multi_fn { } ++spos; } - if (out_ptr) // copy remainder - { - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + if (out_ptr) { + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // copy remainder } else { d_sizes[idx] = bytes; } diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 3aa7467d156..6c4afbb435a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) } } +TEST_F(StringsReplaceTest, EmptyTarget) +{ + auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"}); + auto const sv = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"}); + auto const tv = cudf::strings_column_view(targets); + + auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"}); + auto const rv = cudf::strings_column_view(repls); + + // empty target should be ignored + auto results = cudf::strings::replace_multiple(sv, tv, rv); + auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); diff --git 
a/dependencies.yaml b/dependencies.yaml index e3f8a72e76c..27621ff9a3f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -243,7 +243,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - &ninja ninja build_all: common: @@ -755,7 +755,7 @@ dependencies: - {matrix: null, packages: *cupy_packages_cu11} test_python_pandas_cudf: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: # dependencies to run pandas tests # https://github.com/pandas-dev/pandas/blob/main/environment.yml @@ -766,7 +766,7 @@ dependencies: - pytest-reportlog test_python_cudf_pandas: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - ipython - openpyxl diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index df7749b2a40..1b6cfa73a2b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -18,7 +18,6 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby - io/index.rst interop join lists @@ -26,15 +25,16 @@ This page provides API documentation for pylibcudf. null_mask quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits types unary @@ -42,4 +42,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. 
automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 252d986843a..680a87c789e 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -16,6 +16,10 @@ cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* +cdef add_df_col_struct_names( + df, + child_names_dict +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 1d7c56888d9..58956b9e9b7 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -147,10 +147,37 @@ cdef cppclass iobase_data_sink(data_sink): return buf.tell() +cdef add_df_col_struct_names(df, child_names_dict): + for name, child_names in child_names_dict.items(): + col = df._data[name] + + df._data[name] = update_col_struct_field_names(col, child_names) + + +cdef update_col_struct_field_names(Column col, child_names): + if col.children: + children = list(col.children) + for i, (child, names) in enumerate(zip(children, child_names.values())): + children[i] = update_col_struct_field_names( + child, + names + ) + col.set_base_children(tuple(children)) + + if isinstance(col.dtype, StructDtype): + col = col._rename_fields( + child_names.keys() + ) + + return col + + cdef update_struct_field_names( table, vector[column_name_info]& schema_info ): + # Deprecated, remove in favor of add_col_struct_names + # when a reader is ported to pylibcudf for i, (name, col) in enumerate(table._data.items()): table._data[name] = update_column_struct_field_names( col, schema_info[i] diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 22e34feb547..9c646e3357b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ 
b/python/cudf/cudf/_lib/json.pyx @@ -8,26 +8,16 @@ import cudf from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.io.utils cimport make_source_info, update_struct_field_names -from cudf._lib.pylibcudf.libcudf.io.json cimport ( - json_reader_options, - json_recovery_mode_t, - read_json as libcudf_read_json, - schema_element, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - table_with_metadata, -) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.io.utils cimport add_df_col_struct_names +from cudf._lib.pylibcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport data_from_pylibcudf_io import cudf._lib.pylibcudf as plc @@ -62,6 +52,7 @@ cpdef read_json(object filepaths_or_buffers, # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` + for idx in range(len(filepaths_or_buffers)): if isinstance(filepaths_or_buffers[idx], io.StringIO): filepaths_or_buffers[idx] = \ @@ -71,17 +62,7 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[data_type] c_dtypes_list - cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if 
applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -94,56 +75,50 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - is_list_like_dtypes = False + + processed_dtypes = None + if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: + processed_dtypes = [] if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - c_dtypes_schema_map[str(k).encode()] = \ - _get_cudf_schema_element_from_dtype(v) + # Make sure keys are string + k = str(k) + lib_type, child_types = _get_cudf_schema_element_from_dtype(v) + processed_dtypes.append((k, lib_type, child_types)) elif isinstance(dtype, abc.Collection): - is_list_like_dtypes = True - c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype( - col_dtype)) + processed_dtypes.append( + # Ignore child columns since we cannot specify their dtypes + # when passing a list + _get_cudf_schema_element_from_dtype(col_dtype)[0] + ) else: raise TypeError("`dtype` must be 'list like' or 'dict'") - cdef json_reader_options opts = move( - json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) - .recovery_mode(_get_json_recovery_mode(on_bad_lines)) - .build() + table_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + lines, + byte_range_offset = byte_range[0] if byte_range is not None else 0, + byte_range_size = byte_range[1] if byte_range is not None else 0, + keep_quotes = keep_quotes, + mixed_types_as_string = 
mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) ) - if is_list_like_dtypes: - opts.set_dtypes(c_dtypes_list) - else: - opts.set_dtypes(c_dtypes_schema_map) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - - # Read JSON - cdef cudf_io_types.table_with_metadata c_result - with nogil: - c_result = move(libcudf_read_json(opts)) - - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) - - update_struct_field_names(df, c_result.metadata.schema_info) + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + table_w_meta + ) + ) + # Post-processing to add in struct column names + add_df_col_struct_names(df, table_w_meta.child_names) return df @@ -192,28 +167,32 @@ def write_json( ) -cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: - cdef schema_element s_element - cdef data_type lib_type +cdef _get_cudf_schema_element_from_dtype(object dtype) except *: dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - lib_type = dtype_to_data_type(dtype) - s_element.type = lib_type + + lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + child_types = [] + if isinstance(dtype, cudf.StructDtype): for name, child_type in dtype.fields.items(): - s_element.child_types[name.encode()] = \ + child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(child_type) + child_types.append((name, child_lib_type, grandchild_types)) elif isinstance(dtype, cudf.ListDtype): - s_element.child_types["offsets".encode()] = \ - _get_cudf_schema_element_from_dtype(cudf.dtype("int32")) - s_element.child_types["element".encode()] = \ + child_lib_type, 
grandchild_types = \ _get_cudf_schema_element_from_dtype(dtype.element_type) - return s_element + child_types = [ + ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("element", child_lib_type, grandchild_types) + ] + + return lib_type, child_types cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index caa445c3a0e..e5ad9d01310 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx + traits.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 16d21daa140..92ce4523a5d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -24,6 +24,7 @@ from . cimport ( sorting, stream_compaction, strings, + traits, types, unary, ) @@ -56,12 +57,14 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index fd4f35d30bf..3c3c98be1cc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -24,6 +24,7 @@ sorting, stream_compaction, strings, + traits, types, unary, ) @@ -36,6 +37,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -56,12 +58,14 @@ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index a91d574131f..f7f733a493d 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -1,11 +1,30 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp cimport bool -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, + compression_type, +) +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t from cudf._lib.pylibcudf.libcudf.types cimport size_type +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool lines = *, + size_type byte_range_offset = *, + size_type byte_range_size = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, +) + + cpdef void write_json( SinkInfo sink_info, TableWithMetadata tbl, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 7530eba3803..354cb4981de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -1,16 +1,130 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.limits cimport numeric_limits +from libcpp.map cimport map from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, +) from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_reader_options, + json_recovery_mode_t, json_writer_options, + read_json as cpp_read_json, + schema_element, write_json as cpp_write_json, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata -from cudf._lib.pylibcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef map[string, schema_element] _generate_schema_map(list dtypes): + cdef map[string, schema_element] schema_map + cdef schema_element s_elem + cdef string c_name + + for name, dtype, child_dtypes in dtypes: + if not (isinstance(name, str) and + isinstance(dtype, DataType) and + isinstance(child_dtypes, list)): + + raise ValueError("Must pass a list of a tuple containing " + "(column_name, column_dtype, list of child_dtypes)") + + c_name = name.encode() + + s_elem.type = (dtype).c_obj + s_elem.child_types = _generate_schema_map(child_dtypes) + + schema_map[c_name] = s_elem + return schema_map + + +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool lines = False, + size_type byte_range_offset = 0, + size_type byte_range_size = 0, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. 
+ + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). + compression : CompressionType, default CompressionType.AUTO + The compression format of the JSON source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + keep_quotes : bool, default False + Whether the reader should keep quotes of string values. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recovery_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in.
+ """ + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + + # Read JSON + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_json(opts)) + + return TableWithMetadata.from_libcudf(c_result) cpdef void write_json( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 88daf54f33b..ab223c16a72 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -28,6 +28,11 @@ cdef class TableWithMetadata: cdef vector[column_name_info] _make_column_info(self, list column_names) + cdef list _make_columns_list(self, dict child_dict) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index f94e20970a4..df0b729b711 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -22,6 +22,11 @@ import errno import io import os +from cudf._lib.pylibcudf.libcudf.io.json import \ + json_recovery_mode_t as JSONRecoveryMode # no-cython-lint +from cudf._lib.pylibcudf.libcudf.io.types import \ + compression_type as CompressionType # no-cython-lint + cdef class TableWithMetadata: """A container holding a table and its 
associated metadata @@ -69,16 +74,44 @@ cdef class TableWithMetadata: """ return self.tbl.columns() - @property - def column_names(self): + cdef list _make_columns_list(self, dict child_dict): + cdef list names = [] + for child in child_dict: + grandchildren = self._make_columns_list(child_dict[child]) + names.append((child, grandchildren)) + return names + + def column_names(self, include_children=False): """ Return a list containing the column names of the table """ cdef list names = [] + cdef str name + cdef dict child_names = self.child_names for col_info in self.metadata.schema_info: - # TODO: Handle nesting (columns with child columns) - assert col_info.children.size() == 0, "Child column names are not handled!" - names.append(col_info.name.decode()) + name = col_info.name.decode() + if include_children: + children = self._make_columns_list(child_names[name]) + names.append((name, children)) + else: + names.append(name) + return names + + @property + def child_names(self): + """ + Return a dictionary mapping the names of columns with children + to the names of their child columns + """ + return TableWithMetadata._parse_col_names(self.metadata.schema_info) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos): + cdef dict child_names = dict() + cdef dict names = dict() + for col_info in infos: + child_names = TableWithMetadata._parse_col_names(col_info.children) + names[col_info.name.decode()] = child_names return names @staticmethod @@ -137,6 +170,15 @@ cdef class SourceInfo: cdef vector[host_buffer] c_host_buffers cdef const unsigned char[::1] c_buffer cdef bint empty_buffer = False + cdef list new_sources = [] + + if isinstance(sources[0], io.StringIO): + for buffer in sources: + if not isinstance(buffer, io.StringIO): + raise ValueError("All sources must be of the same type!") + new_sources.append(buffer.read().encode()) + sources = new_sources + if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: @@ -156,7 
+198,10 @@ cdef class SourceInfo: c_buffer.shape[0])) else: raise ValueError("Sources must be a list of str/paths, " - "bytes, io.BytesIO, or a Datasource") + "bytes, io.BytesIO, io.StringIO, or a Datasource") + + if empty_buffer is True: + c_host_buffers.push_back(host_buffer(NULL, 0)) self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 6c66d01ca57..699e85ce567 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -22,4 +22,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(io) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..6831063ecb9 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources json.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_ +) + +set(targets_using_arrow_headers cpp_io_json cpp_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 2e50cccd132..86621ae184f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -19,9 +19,9 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types - cdef enum json_recovery_mode_t: - FAIL "cudf::io::json_recovery_mode_t::FAIL" - RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cpdef enum class json_recovery_mode_t(int32_t): + FAIL + RECOVER_WITH_NULL cdef cppclass json_reader_options: json_reader_options() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index fd21e7b334b..8917a6ac899 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ 
b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -10,7 +10,9 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): lists_column_view() except + + lists_column_view(const lists_column_view& lists_column) except + lists_column_view(const column_view& lists_column) except + + lists_column_view& operator=(const lists_column_view&) except + column_view parent() except + column_view offsets() except + column_view child() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd new file mode 100644 index 00000000000..0382a5d42c3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) + + +cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] reverse( + const lists_column_view& lists_column, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..c9d0a84e8ac 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column reverse(Column) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..651f1346f88 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + reverse as cpp_reverse, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, @@ -206,3 +207,28 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o find_option, )) return 
Column.from_libcudf(move(c_result)) + + +cpdef Column reverse(Column input): + """Reverse the element order within each list of the input column. + + For details, see :cpp:func:`reverse`. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column with reversed lists. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_reverse.reverse( + list_view.view(), + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..d2370f8d641 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + For details, see :cpp:func:`is_relationally_comparable`. + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + For details, see :cpp:func:`is_equality_comparable`. + """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + For details, see :cpp:func:`is_numeric`. + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. + + For details, see :cpp:func:`is_index_type`. + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + For details, see :cpp:func:`is_unsigned`. + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + For details, see :cpp:func:`is_integral`. + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + For details, see :cpp:func:`is_integral_not_bool`. + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + For details, see :cpp:func:`is_floating_point`. + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + For details, see :cpp:func:`is_boolean`. 
+ """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + For details, see :cpp:func:`is_timestamp`. + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + For details, see :cpp:func:`is_fixed_point`. + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + For details, see :cpp:func:`is_duration`. + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. + + For details, see :cpp:func:`is_chrono`. + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + For details, see :cpp:func:`is_dictionary`. + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + For details, see :cpp:func:`is_fixed_width`. + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + For details, see :cpp:func:`is_compound`. + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + For details, see :cpp:func:`is_nested`. + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + For details, see :cpp:func:`is_bit_castable`. 
+ """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index de6b9f690b6..f136cd997a7 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -322,7 +322,7 @@ cdef data_from_pylibcudf_io(tbl_with_meta): """ return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names, + column_names=tbl_with_meta.column_names(include_children=False), index_names=None ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c548db67344..1992d471947 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -73,10 +73,15 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) n += child0_size + current_offset_col = current_base_child.base_children[0] + if not len(current_offset_col): + # See https://github.com/rapidsai/cudf/issues/16164 why + # offset column can be uninitialized + break + current_offset = current_offset_col.element_indexing( + current_offset + ) current_base_child = current_base_child.base_children[1] n += ( diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..46603ff32b8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -8,13 +8,14 @@ import pytest from cudf._lib import pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType def metadata_from_arrow_type( pa_type: pa.Array, name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = plc.interop.ColumnMetadata(name) # None + metadata = plc.interop.ColumnMetadata(name) if 
pa.types.is_list(pa_type): + child_meta = [plc.interop.ColumnMetadata("offsets")] + for i in range(pa_type.num_fields): @@ -39,9 +40,25 @@ def metadata_from_arrow_type( def assert_column_eq( - lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column + lhs: pa.Array | plc.Column, + rhs: pa.Array | plc.Column, + check_field_nullability=True, ) -> None: - """Verify that a pylibcudf array and PyArrow array are equal.""" + """Verify that a pylibcudf array and PyArrow array are equal. + + Parameters + ---------- + lhs: Union[pa.Array, plc.Column] + The array with the expected values + rhs: Union[pa.Array, plc.Column] + The array to check + check_field_nullability: + For list/struct dtypes, whether to check if the nullable attributes + on child fields are equal. + + Useful for checking roundtripping of lossy formats like JSON that may not + preserve this information. + """ # Nested types require children metadata to be passed to the conversion function. if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( rhs, plc.Column @@ -65,6 +82,33 @@ def assert_column_eq( if isinstance(rhs, pa.ChunkedArray): rhs = rhs.combine_chunks() + def _make_fields_nullable(typ): + new_fields = [] + for i in range(typ.num_fields): + child_field = typ.field(i) + if not child_field.nullable: + child_type = child_field.type + if isinstance(child_field.type, (pa.StructType, pa.ListType)): + child_type = _make_fields_nullable(child_type) + new_fields.append( + pa.field(child_field.name, child_type, nullable=True) + ) + else: + new_fields.append(child_field) + + if isinstance(typ, pa.StructType): + return pa.struct(new_fields) + elif isinstance(typ, pa.ListType): + return pa.list_(new_fields[0]) + return typ + + if not check_field_nullability: + rhs_type = _make_fields_nullable(rhs.type) + rhs = rhs.cast(rhs_type) + + lhs_type = _make_fields_nullable(lhs.type) + lhs = lhs.cast(lhs_type) + assert lhs.equals(rhs) @@ -78,20 +122,24 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) ->
None: def assert_table_and_meta_eq( - plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table + pa_table: pa.Table, + plc_table_w_meta: plc.io.types.TableWithMetadata, + check_field_nullability=True, ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" plc_table = plc_table_w_meta.tbl plc_shape = (plc_table.num_rows(), plc_table.num_columns()) - assert plc_shape == pa_table.shape + assert ( + plc_shape == pa_table.shape + ), f"{plc_shape} is not equal to {pa_table.shape}" for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): - assert_column_eq(plc_col, pa_col) + assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality - assert plc_table_w_meta.column_names == pa_table.column_names + assert plc_table_w_meta.column_names() == pa_table.column_names def cudf_raises(expected_exception: BaseException, *args, **kwargs): @@ -102,49 +150,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): @@ -221,4 +230,26 @@ def sink_to_str(sink): + DEFAULT_PA_STRUCT_TESTING_TYPES ) +# Map pylibcudf compression types to pandas ones +# Not all compression types map cleanly, read the comments to learn more! +# If a compression type is unsupported, it maps to False. 
+ +COMPRESSION_TYPE_TO_PANDAS = { + CompressionType.NONE: None, + # Users of this dict will have to special case + # AUTO + CompressionType.AUTO: None, + CompressionType.GZIP: "gzip", + CompressionType.BZIP2: "bz2", + CompressionType.ZIP: "zip", + CompressionType.XZ: "xz", + CompressionType.ZSTD: "zstd", + # Unsupported + CompressionType.ZLIB: False, + CompressionType.LZ4: False, + CompressionType.LZO: False, + # These only work for parquet + CompressionType.SNAPPY: "snappy", + CompressionType.BROTLI: "brotli", +} ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e4760ea7ac8..39832eb4bba 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -121,6 +121,11 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +@pytest.fixture(params=[opt for opt in plc.io.types.CompressionType]) +def compression_type(request): + return request.param + + @pytest.fixture( scope="session", params=[opt for opt in plc.types.Interpolation] ) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py index d6cd86768cd..061d6792ce3 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py @@ -120,4 +120,4 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): if columns != []: expected = expected.select(columns) - assert_table_and_meta_eq(res, expected) + assert_table_and_meta_eq(expected, res) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index d6b8bfa6976..c13eaf40625 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -1,11 +1,49 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import io +import pandas as pd import pyarrow as pa import pytest -from utils import sink_to_str +from utils import ( + COMPRESSION_TYPE_TO_PANDAS, + assert_table_and_meta_eq, + sink_to_str, +) import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + + +def make_json_source(path_or_buf, pa_table, **kwargs): + """ + Uses pandas to write a pyarrow Table to a JSON file. + + The caller is responsible for making sure that no arguments + unsupported by pandas are passed in. + """ + df = pa_table.to_pandas() + if "compression" in kwargs: + kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ + kwargs["compression"] + ] + df.to_json(path_or_buf, orient="records", **kwargs) + if isinstance(path_or_buf, io.IOBase): + path_or_buf.seek(0) + return path_or_buf + + +def write_json_bytes(source, json_str): + """ + Write a JSON string to the source + """ + if not isinstance(source, io.IOBase): + with open(source, "w") as source_f: + source_f.write(json_str) + else: + if isinstance(source, io.BytesIO): + json_str = json_str.encode("utf-8") + source.write(json_str) + source.seek(0) @pytest.mark.parametrize("rows_per_chunk", [8, 100]) @@ -114,3 +152,238 @@ def test_write_json_bool_opts(true_value, false_value): pd_result = pd_result.replace("false", false_value) assert str_result == pd_result + + +@pytest.mark.parametrize("lines", [True, False]) +def test_read_json_basic( + table_data, source_or_sink, lines, compression_type, request +): + if compression_type in { + # Not supported by libcudf + CompressionType.SNAPPY, + CompressionType.XZ, + CompressionType.ZSTD, + # Not supported by pandas + # TODO: find a way to test these + CompressionType.BROTLI, + CompressionType.LZ4, + CompressionType.LZO, + CompressionType.ZLIB, + }: + pytest.skip("unsupported compression type by pandas/libcudf") + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + _, pa_table = 
table_data + + source = make_json_source( + source_or_sink, pa_table, lines=lines, compression=compression_type + ) + + request.applymarker( + pytest.mark.xfail( + condition=( + len(pa_table) > 0 + and compression_type + not in {CompressionType.NONE, CompressionType.AUTO} + ), + # note: wasn't able to narrow down the specific types that were failing + # seems to be a little non-deterministic, but always fails with + # cudaErrorInvalidValue invalid argument + reason="libcudf json reader crashes on compressed non empty table_data", + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), + compression=compression_type, + lines=lines, + ) + + # Adjustments to correct for the fact orient=records is lossy + # and doesn't + # 1) preserve colnames when zero rows in table + # 2) preserve struct nullability + # 3) differentiate int64/uint64 + if len(pa_table) == 0: + pa_table = pa.table([]) + + new_fields = [] + for i in range(len(pa_table.schema)): + curr_field = pa_table.schema.field(i) + if curr_field.type == pa.uint64(): + try: + curr_field = curr_field.with_type(pa.int64()) + except OverflowError: + # There will be no confusion, values are too large + # for int64 anyways + pass + new_fields.append(curr_field) + + pa_table = pa_table.cast(pa.schema(new_fields)) + + # Convert non-nullable struct fields to nullable fields + # since nullable=False cannot roundtrip through orient='records' + # JSON format + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +def test_read_json_dtypes(table_data, source_or_sink): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = table_data + source = make_json_source( + source_or_sink, + pa_table, + lines=True, + ) + + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + def get_child_types(typ): + typ_child_types = [] + for i in 
range(typ.num_fields): + curr_field = typ.field(i) + typ_child_types.append( + ( + curr_field.name, + curr_field.type, + get_child_types(curr_field.type), + ) + ) + return typ_child_types + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + + new_schema = pa.schema(new_fields) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + ) + new_table = pa_table.cast(new_schema) + + # orient=records is lossy + # and doesn't preserve column names when there's zero rows in the table + if len(new_table) == 0: + new_table = pa.table([]) + + assert_table_and_meta_eq(new_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize("chunk_size", [10, 15, 20]) +def test_read_json_lines_byte_range(source_or_sink, chunk_size): + source = source_or_sink + if isinstance(source_or_sink, io.StringIO): + pytest.skip("byte_range doesn't work on StringIO") + + json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" + write_json_bytes(source, json_str) + + tbls_w_meta = [] + for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): + tbls_w_meta.append( + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + byte_range_offset=chunk_start, + byte_range_size=chunk_start + chunk_size, + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_json(source, orient="records", lines=True) + + # TODO: can do this operation using pylibcudf + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + 
assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("keep_quotes", [True, False]) +def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): + source = source_or_sink + + json_bytes = '["a", "b", "c"]\n' + write_json_bytes(source, json_bytes) + + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ) + + template = "{0}" + if keep_quotes: + template = '"{0}"' + + exp = pa.Table.from_arrays( + [ + [template.format("a")], + [template.format("b")], + [template.format("c")], + ], + names=["0", "1", "2"], + ) + + assert_table_and_meta_eq(exp, tbl_w_meta) + + +@pytest.mark.parametrize( + "recovery_mode", [opt for opt in plc.io.types.JSONRecoveryMode] +) +def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): + source = source_or_sink + + json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + write_json_bytes(source, json_bytes) + + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: + with pytest.raises(RuntimeError): + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + else: + # Recover case (bad values replaced with nulls) + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + exp = pa.Table.from_arrays( + [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] + ) + assert_table_and_meta_eq(exp, tbl_w_meta) + + +# TODO: Add tests for these! 
+# Tests were not added in the initial PR porting the JSON reader to pylibcudf +# to save time (and since there are no existing tests for these in Python cuDF) +# mixed_types_as_string = mixed_types_as_string, +# prune_columns = prune_columns, diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, 
plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): 
_, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, 
plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..58a1dcf8d56 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -134,3 +134,15 @@ def test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +def test_reverse(test_data): + list_column = test_data[0][0] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.reverse(plc_column) + + expect = pa.array([lst[::-1] for lst in list_column]) + + assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not 
plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..9222f6d23db 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1077,8 +1077,13 @@ def test_json_dtypes_nested_data(): ) pdf = pd.read_json( - StringIO(expected_json_str), orient="records", lines=True + StringIO(expected_json_str), + orient="records", + lines=True, ) + + assert_eq(df, pdf) + pdf.columns = pdf.columns.astype("str") pa_table_pdf = pa.Table.from_pandas( pdf, schema=df.to_arrow().schema, safe=False 
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f76143cb381..ec9d7995b05 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -12,6 +12,7 @@ from cudf import NA from cudf._lib.copying import get_element from cudf.api.types import is_scalar +from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @@ -926,3 +927,29 @@ def test_list_iterate_error(): def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) + nested_col = col.children[1] + empty_inner = type(nested_col)( + size=nested_col.size, + dtype=nested_col.dtype, + mask=nested_col.mask, + offset=nested_col.offset, + null_count=nested_col.null_count, + children=( + column_empty(0, nested_col.children[0].dtype), + nested_col.children[1], + ), + ) + col_empty_offset = type(col)( + size=col.size, + dtype=col.dtype, + mask=col.mask, + offset=col.offset, + null_count=col.null_count, + children=(column_empty(0, col.children[0].dtype), empty_inner), + ) + ser = cudf.Series._from_data({None: col_empty_offset}) + assert ser.memory_usage() == 8 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 20b731624df..dcb33b1fc1a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -121,7 +121,7 @@ skip = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 11e18cd4f32..badfdf06d15 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -101,7 
+101,7 @@ regex = "(?P.*)" build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. 
""" try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..d86656578d7 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,6 +5,7 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast @@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..93cb9db7cbd 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -44,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -703,6 +704,7 @@ def _validate_input(self): pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.Slice, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -716,6 +718,11 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + 
) def do_evaluate( self, @@ -744,6 +751,36 @@ def do_evaluate( flags=plc.strings.regex_flags.RegexFlags.DEFAULT, ) return Column(plc.strings.contains.contains_re(column.obj, prog)) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. + # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -779,6 +816,129 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("round", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # 
construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") @@ -1182,7 +1342,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..6b552642e88 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: @@ -431,7 +433,7 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..5a1e682abe7 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -361,8 +361,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register @@ -432,8 +447,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) - else: - return expr.Cast(dtype, inner) + elif isinstance(inner, expr.Cast): + # Translation of Len/Count-agg put in a cast, remove double + # casts if we have one. 
+ (inner,) = inner.children + return expr.Cast(dtype, inner) @_translate_expr.register @@ -443,12 +461,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -475,7 +496,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value # pragma: no cover; never reached since polars len has uint32 dtype def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index bf4673fcc50..0b559f7a8e9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -182,5 +182,3 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" -# Pure python -disable-cuda = true diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import methodcaller + import pytest import polars as pl @@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. + # https://github.com/rapidsai/cudf/issues/16196 + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..3af3a0ce6d1 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 9729e765948..8cf65dd51ac 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -37,6 +37,30 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) +slice_cases = [ + (1, 3), + (0, 3), + (0, 0), + (-3, 1), + (-100, 5), + (1, 1), + (100, 100), + (-3, 4), + (-3, 3), +] + + +@pytest.fixture(params=slice_cases) +def slice_column_data(ldf, request): + start, length = request.param + if length: + return ldf.with_columns( + pl.lit(start).alias("start"), pl.lit(length).alias("length") + ) + else: + return ldf.with_columns(pl.lit(start).alias("start")) + + def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), @@ -104,3 +128,25 @@ def test_contains_invalid(ldf): query.collect() with pytest.raises(pl.exceptions.ComputeError): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) + + +@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) +def test_slice_scalars_offset(ldf, offset): + query = ldf.select(pl.col("a").str.slice(offset)) + assert_gpu_result_equal(query) + + 
+@pytest.mark.parametrize("offset,length", slice_cases) +def test_slice_scalars_length_and_offset(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) + + +def test_slice_column(slice_column_data): + if "length" in slice_column_data.collect_schema(): + query = slice_column_data.select( + pl.col("a").str.slice(pl.col("start"), pl.col("length")) + ) + else: + query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) + assert_ir_translation_raises(query, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index aefad59eb91..b84e2c16b43 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -47,6 +47,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + 
[pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -83,10 +85,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): def test_groupby_len(df, keys): q = df.group_by(*keys).agg(pl.len()) - # TODO: polars returns UInt32, libcudf returns Int32 - with pytest.raises(AssertionError): - assert_gpu_result_equal(q, check_row_order=False) - assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False) + assert_gpu_result_equal(q, check_row_order=False) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index b021d832910..865b95a7d91 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -46,3 +46,12 @@ def test_concat_vertical(): q = pl.concat([ldf, ldf2], how="vertical") assert_gpu_result_equal(q) + + +def test_concat_diagonal_empty(): + df1 = pl.LazyFrame() + df2 = pl.LazyFrame({"a": [1, 2]}) + + q = pl.concat([df1, df2], how="diagonal_relaxed") + + assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})