diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1275aad757c..ad3f5940b94 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,8 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - run_script: "ci/clang_tidy.sh" + run_script: "ci/cpp_linters.sh" + file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5234f58efe..6d070a8a14c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,8 +90,8 @@ repos: entry: | # Check for usage of default_rng without seeding default_rng\(\)| - # Check for usage of np.random.seed - np.random.seed\( + # Check for usage of np.random.seed (NPY002 only disallows this being called) + np.random.seed language: pygrep types: [python] - id: cmake-format diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index ae4eb0d5c66..32dd5a7fa62 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -27,4 +27,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 79853cdbdb2..38048125247 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/cudf_polars" ./ci/build_wheel.sh cudf-polars ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 00c64afa2ef..b0ae2f23abc 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/dask_cudf" ./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index aabd3814a24..af49942c8cd 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -37,4 +37,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index c4a89f20f5f..5a8f3397714 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -25,4 +25,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh deleted file mode 100755 index 4d5d3fc3136..00000000000 --- a/ci/clang_tidy.sh +++ /dev/null @@ -1,29 +0,0 @@ 
-#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. - -set -euo pipefail - -rapids-logger "Create clang-tidy conda environment" -. /opt/conda/etc/profile.d/conda.sh - -ENV_YAML_DIR="$(mktemp -d)" - -rapids-dependency-file-generator \ - --output conda \ - --file-key clang_tidy \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" - -rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy - -# Temporarily allow unbound variables for conda activation. -set +u -conda activate clang_tidy -set -u - -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" - -source rapids-configure-sccache - -# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled. -cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja -cmake --build cpp/build diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh new file mode 100755 index 00000000000..a7c7255456f --- /dev/null +++ b/ci/cpp_linters.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# TODO: For testing purposes, clone and build IWYU. We can switch to a release +# once a clang 19-compatible version is available, which should be soon +# (https://github.com/include-what-you-use/include-what-you-use/issues/1641). +git clone --depth 1 https://github.com/include-what-you-use/include-what-you-use.git +pushd include-what-you-use +# IWYU's CMake build uses some Python scripts that assume that the cwd is +# importable, so support that legacy behavior. +export PYTHONPATH=${PWD}:${PYTHONPATH:-} +cmake -S . -B build -GNinja --install-prefix=${CONDA_PREFIX} +cmake --build build +cmake --install build +popd + +# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. +cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja +cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py + +# Remove invalid components of the path for local usage. The path below is +# valid in the CI due to where the project is cloned, but presumably the fixes +# will be applied locally from inside a clone of cudf. +sed -i 's/\/__w\/cudf\/cudf\///' iwyu_results.txt diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index f5bcdc62604..fefe26984cb 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Running polars test suite PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 2884757e46b..6c827406f78 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Testing PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000000..5910a5c59fe --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8a64ebf40c5..e91443ddba8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -66,10 +66,10 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5f779c3170f..2dccb595e59 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 @@ -64,9 +64,9 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index edf92b930d9..7a477291e7a 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.12 + - polars >=1.11,<1.14 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2aafcae072d..04904e95630 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13 + - numba-cuda >=0.0.13,<0.0.18 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 12120a5c6d1..0e5699876fc 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,8 +39,8 @@ Checks: -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '*' -HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +WarningsAsErrors: '' +HeaderFilterRegex: '.*cudf/cpp/(src|include).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bfa4bf80724..e237b0b2856 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,7 +88,13 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) -option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) +option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) + +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. 
 AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." + ON +) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -109,6 +115,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -146,8 +155,10 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) endif() # ################################################################################################## -# * clang-tidy configuration ---------------------------------------------------------------------- -if(CUDF_CLANG_TIDY) +# * linter configuration --------------------------------------------------------------------------- +if(CUDF_STATIC_LINTERS) + # For simplicity, for now we assume that all linters can be installed into an environment where + # any linter is being run. We could relax this requirement if desired. find_program( CLANG_TIDY_EXE NAMES "clang-tidy" @@ -174,24 +185,48 @@ if(CUDF_CLANG_TIDY) "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" ) endif() + + find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED) endif() # Turn on the requested static checkers for a target, excluding the files specified in SKIPPED_FILES. -function(enable_clang_tidy target) - set(_tidy_options) +function(enable_static_checkers target) + set(_tidy_options IWYU CLANG_TIDY) set(_tidy_one_value) set(_tidy_multi_value SKIPPED_FILES) cmake_parse_arguments( - _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} ) - if(CUDF_CLANG_TIDY) - # clang will complain about unused link libraries on the compile line unless we specify - # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) - foreach(file IN LISTS _TIDY_SKIPPED_FILES) + if(CUDF_STATIC_LINTERS) + if(_LINT_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() + if(_LINT_IWYU) + # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not + # relevant since they don't show up in any other build, so it's better to suppress them until + # we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not + # appear to be sufficient; we must also ensure that it is added to the underlying target's CXX + # compile flags. To do this completely cleanly we should modify the flags on the target rather + # than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run + # the linters on real builds.
+      foreach(_flag -Wno-missing-braces -Wno-absolute-value -Wunneeded-internal-declaration) +        list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index) +        if(_flag_index EQUAL -1) +          list(APPEND CUDF_CXX_FLAGS ${_flag}) +        endif() +      endforeach() +      set(CUDF_CXX_FLAGS +          "${CUDF_CXX_FLAGS}" +          PARENT_SCOPE +      ) +      set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") +    endif() +    foreach(file IN LISTS _LINT_SKIPPED_FILES) set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) endforeach() endif() @@ -368,11 +403,14 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu + src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu @@ -667,6 +705,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -771,11 +810,15 @@ set_target_properties( INTERFACE_POSITION_INDEPENDENT_CODE ON ) +# Note: This must come before the target_compile_options below so that the function can modify the +# flags if necessary. +enable_static_checkers( + cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU +) target_compile_options( cudf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>" "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>" ) -enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option @@ -857,6 +900,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bdc360c082b..419b78db9b0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,23 +348,15 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( - STRINGS_BENCH - string/convert_datetime.cpp - string/convert_durations.cpp - string/copy.cu - string/factory.cu - string/filter.cpp - string/repeat_strings.cpp -
string/replace.cpp - string/translate.cpp - string/url_decode.cu + STRINGS_BENCH string/factory.cu string/filter.cpp string/repeat_strings.cpp string/replace.cpp + string/translate.cpp string/url_decode.cu ) ConfigureNVBench( @@ -373,14 +365,17 @@ ConfigureNVBench( string/char_types.cpp string/combine.cpp string/contains.cpp + string/convert_datetime.cpp + string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp + string/copy.cpp string/copy_if_else.cpp string/copy_range.cpp string/count.cpp string/extract.cpp string/find.cpp - string/gather.cpp + string/find_multiple.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bdce8a31176..8bce718c7d8 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -540,7 +542,7 @@ struct string_generator { // range 32-127 is ASCII; 127-136 will be multi-byte UTF-8 { } - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(thrust::tuple str_begin_end) { auto begin = thrust::get<0>(str_begin_end); auto end = thrust::get<1>(str_begin_end); @@ -569,6 +571,9 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons distribution_params{1. - profile.get_null_probability().value_or(0)}); auto lengths = len_dist(engine, num_rows + 1); auto null_mask = valid_dist(engine, num_rows + 1); + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + thrust::transform_if( thrust::device, lengths.begin(), @@ -580,28 +585,26 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); - thrust::exclusive_scan( - thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offsets are ready. - auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); + + // offsets are created as INT32 or INT64 as appropriate + auto [offsets, chars_length] = cudf::strings::detail::make_offsets_child_column( + valid_lengths, valid_lengths + num_rows, stream, mr); + // use the offsetalator to normalize the offset values for use by the string_generator + auto offsets_itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), + thrust::make_zip_iterator(offsets_itr, offsets_itr + 1), num_rows, string_generator{chars.data(), engine}); + auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), - null_mask.end() - 1, - thrust::identity{}, - cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + profile.get_null_probability().has_value() + ? cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, stream, mr) + : std::pair{rmm::device_buffer{}, 0}; + return cudf::make_strings_column( - num_rows, - std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - null_count, - profile.get_null_probability().has_value() ? 
std::move(result_bitmask) : rmm::device_buffer{}); + num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask)); } /** diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index 1e3ab2b7b4f..cc548ccd3de 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -28,6 +28,7 @@ enum class data_type : int32_t { INTEGRAL = static_cast(type_group_id::INTEGRAL), INTEGRAL_SIGNED = static_cast(type_group_id::INTEGRAL_SIGNED), FLOAT = static_cast(type_group_id::FLOATING_POINT), + BOOL8 = static_cast(cudf::type_id::BOOL8), DECIMAL = static_cast(type_group_id::FIXED_POINT), TIMESTAMP = static_cast(type_group_id::TIMESTAMP), DURATION = static_cast(type_group_id::DURATION), @@ -44,6 +45,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( case data_type::INTEGRAL: return "INTEGRAL"; case data_type::INTEGRAL_SIGNED: return "INTEGRAL_SIGNED"; case data_type::FLOAT: return "FLOAT"; + case data_type::BOOL8: return "BOOL8"; case data_type::DECIMAL: return "DECIMAL"; case data_type::TIMESTAMP: return "TIMESTAMP"; case data_type::DURATION: return "DURATION"; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index ce115fd7723..b14f9cbb67e 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,7 @@ void BM_parquet_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -298,6 +299,7 @@ void BM_parquet_read_wide_tables_mixed(nvbench::state& state) using d_type_list = nvbench::enum_type_list(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 256e50f0e64..84e4b8b93c0 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -89,6 +89,7 @@ void BM_parq_write_io_compression( { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -143,6 +144,7 @@ void BM_parq_write_varying_options( auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -181,6 +183,7 @@ void BM_parq_write_varying_options( using d_type_list = nvbench::enum_type_list(joined_table->column("n_name")); - auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto o_year = cudf::datetime::extract_datetime_component( + joined_table->column("o_orderdate"), cudf::datetime::datetime_component::YEAR); auto amount = calculate_amount(joined_table->column("l_discount"), joined_table->column("l_extendedprice"), joined_table->column("ps_supplycost"), diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5deca3664b7..288aa6029d3 100644 --- 
a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -16,62 +16,59 @@ #include #include -#include -#include #include #include +#include #include -class StringDateTime : public cudf::benchmark {}; +#include -enum class direction { to, from }; +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_D, "cudf::timestamp_D", "cudf::timestamp_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "cudf::timestamp_s", "cudf::timestamp_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_us, "cudf::timestamp_us", "cudf::timestamp_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ns, "cudf::timestamp_ns", "cudf::timestamp_ns"); -template -void BM_convert_datetime(benchmark::State& state, direction dir) +using Types = nvbench::type_list; + +template +void bench_convert_datetime(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); - auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_ts = state.get_string("dir") == "from"; - auto const column = create_random_column(data_type.id(), row_count{n_rows}); - cudf::column_view input(column->view()); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); - auto source = dir == direction::to ? cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S") - : make_empty_column(cudf::data_type{cudf::type_id::STRING}); - cudf::strings_column_view source_string(source->view()); + auto format = std::string{"%Y-%m-%d %H:%M:%S"}; + auto s_col = cudf::strings::from_timestamps(ts_col->view(), format); + auto sv = cudf::strings_column_view(s_col->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true); - if (dir == direction::to) - cudf::strings::to_timestamps(source_string, data_type, "%Y-%m-%d %H:%M:%S"); - else - cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S"); - } + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto const bytes = dir == direction::to ? 
source_string.chars_size(cudf::get_default_stream()) - : n_rows * sizeof(TypeParam); - state.SetBytesProcessed(state.iterations() * bytes); + if (from_ts) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::from_timestamps(ts_col->view(), format); + }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_timestamps(sv, data_type, format); + }); + } } -#define STR_BENCHMARK_DEFINE(name, type, dir) \ - BENCHMARK_DEFINE_F(StringDateTime, name)(::benchmark::State & state) \ - { \ - BM_convert_datetime(state, dir); \ - } \ - BENCHMARK_REGISTER_F(StringDateTime, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -STR_BENCHMARK_DEFINE(from_days, cudf::timestamp_D, direction::from); -STR_BENCHMARK_DEFINE(from_seconds, cudf::timestamp_s, direction::from); -STR_BENCHMARK_DEFINE(from_mseconds, cudf::timestamp_ms, direction::from); -STR_BENCHMARK_DEFINE(from_useconds, cudf::timestamp_us, direction::from); -STR_BENCHMARK_DEFINE(from_nseconds, cudf::timestamp_ns, direction::from); - -STR_BENCHMARK_DEFINE(to_days, cudf::timestamp_D, direction::to); -STR_BENCHMARK_DEFINE(to_seconds, cudf::timestamp_s, direction::to); -STR_BENCHMARK_DEFINE(to_mseconds, cudf::timestamp_ms, direction::to); -STR_BENCHMARK_DEFINE(to_useconds, cudf::timestamp_us, direction::to); -STR_BENCHMARK_DEFINE(to_nseconds, cudf::timestamp_ns, direction::to); +NVBENCH_BENCH_TYPES(bench_convert_datetime, NVBENCH_TYPE_AXES(Types)) + .set_name("datetime") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index f12d292c2e7..9d2377f2d82 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,92 +14,60 @@ * limitations under the License. 
*/ +#include #include -#include - -#include #include #include -#include +#include #include -#include -#include - -class DurationsToString : public cudf::benchmark {}; -template -void BM_convert_from_durations(benchmark::State& state) -{ - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); +#include - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_D, "cudf::duration_D", "cudf::duration_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_s, "cudf::duration_s", "cudf::duration_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ms, "cudf::duration_ms", "cudf::duration_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_us, "cudf::duration_us", "cudf::duration_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ns, "cudf::duration_ns", "cudf::duration_ns"); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); -} +using Types = nvbench::type_list; -class StringToDurations : public cudf::benchmark {}; -template -void BM_convert_to_durations(benchmark::State& state) +template +void bench_convert_duration(nvbench::state& state, nvbench::type_list) { - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); - - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); - auto results = cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - cudf::strings_column_view source_string(*results); - auto output_type = cudf::data_type(cudf::type_to_id()); - - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::to_durations(source_string, output_type, "%D days %H:%M:%S"); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const from_dur = state.get_string("dir") == "from"; + + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); + cudf::column_view input(ts_col->view()); + + auto format = std::string{"%D days %H:%M:%S"}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_dur) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(format.size() * num_rows); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_durations(input, format); }); + } else { + auto source = cudf::strings::from_durations(input, format); + auto view = cudf::strings_column_view(source->view()); + state.add_global_memory_reads(view.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_durations(view, data_type, format); + }); } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); } -#define DSBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(DurationsToString, name)(::benchmark::State & state) \ - { \ - BM_convert_from_durations(state); \ - } \ - 
BENCHMARK_REGISTER_F(DurationsToString, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define SDBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(StringToDurations, name)(::benchmark::State & state) \ - { \ - BM_convert_to_durations(state); \ - } \ - BENCHMARK_REGISTER_F(StringToDurations, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -DSBM_BENCHMARK_DEFINE(from_durations_D, cudf::duration_D); -DSBM_BENCHMARK_DEFINE(from_durations_s, cudf::duration_s); -DSBM_BENCHMARK_DEFINE(from_durations_ms, cudf::duration_ms); -DSBM_BENCHMARK_DEFINE(from_durations_us, cudf::duration_us); -DSBM_BENCHMARK_DEFINE(from_durations_ns, cudf::duration_ns); - -SDBM_BENCHMARK_DEFINE(to_durations_D, cudf::duration_D); -SDBM_BENCHMARK_DEFINE(to_durations_s, cudf::duration_s); -SDBM_BENCHMARK_DEFINE(to_durations_ms, cudf::duration_ms); -SDBM_BENCHMARK_DEFINE(to_durations_us, cudf::duration_us); -SDBM_BENCHMARK_DEFINE(to_durations_ns, cudf::duration_ns); +NVBENCH_BENCH_TYPES(bench_convert_duration, NVBENCH_TYPE_AXES(Types)) + .set_name("duration") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 10, 1 << 15, 1 << 20, 1 << 25}); diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cpp new file mode 100644 index 00000000000..2baccd4fad1 --- /dev/null +++ b/cpp/benchmarks/string/copy.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_copy(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const source = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_to_id()}, row_count{num_rows}, map_profile); + auto const map_view = map_table->view().column(0); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (api == "gather") { + auto result = + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + }); + } else if (api == "scatter") { + auto const target = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + auto result = cudf::scatter(source->view(), map_view, target->view(), stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scatter(source->view(), map_view, target->view(), stream); + }); + } +} + +NVBENCH_BENCH(bench_copy) + .set_name("copy") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"gather", "scatter"}); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu deleted file mode 100644 index 6b2f6c3a0a7..00000000000 --- a/cpp/benchmarks/string/copy.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -class StringCopy : public cudf::benchmark {}; - -enum copy_type { gather, scatter }; - -static void BM_copy(benchmark::State& state, copy_type ct) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - auto const source = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - auto const target = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - - // scatter indices - auto index_map_col = make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); - auto index_map = index_map_col->mutable_view(); - thrust::shuffle_copy(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(n_rows), - index_map.begin(), - thrust::default_random_engine()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case gather: cudf::gather(source->view(), index_map); break; - case scatter: cudf::scatter(source->view(), index_map, target->view()); break; - } - } - - state.SetBytesProcessed( - state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); - - // Benchmark for very small strings - b->Args({67108864, 2}); -} - -#define COPY_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCopy, name) \ - (::benchmark::State & st) { BM_copy(st, copy_type::name); } \ - BENCHMARK_REGISTER_F(StringCopy, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -COPY_BENCHMARK_DEFINE(gather) -COPY_BENCHMARK_DEFINE(scatter) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 996bdcf0332..3ea3ff13a2f 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state) auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& 
 launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); @@ -79,7 +71,7 @@ NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..0e780fdb302 --- /dev/null +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_find_string(nvbench::state& state) +{ + auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate")); + auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets")); + auto const api = state.get_string("api"); + + auto const stream = cudf::get_default_stream(); + auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); + + // Note that these all match the first row of the raw_data in create_string_column. + // This is so the hit_rate can be properly accounted for.
+ std::vector<std::string> const target_data( + {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); + auto h_targets = std::vector<std::string>{}; + for (cudf::size_type i = 0; i < target_count; i++) { + h_targets.emplace_back(target_data[i % target_data.size()]); + } + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + if (api == "find") { + state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); + } + + if (api == "find") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(targets)); + }); + } +} + +NVBENCH_BENCH(bench_find_string) + .set_name("find_multiple") + .add_string_axis("api", {"find", "contains"}) + .add_int64_axis("targets", {10, 20, 40}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp deleted file mode 100644 index 5b1c679be7d..00000000000 --- a/cpp/benchmarks/string/gather.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include - -#include -#include -#include - -#include - -static void bench_gather(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const input_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); - - data_profile const map_profile = data_profile_builder().no_validity().distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); - auto const map_table = - create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = - cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); // all bytes are read; - state.add_global_memory_writes(chars_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::gather( - input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); - }); -} - -NVBENCH_BENCH(bench_gather) - .set_name("gather") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? 
cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c949f48505e..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION) GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index b662b660557..7cd4697f592 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,6 +18,7 @@ #include "types.hpp" +#include #include #include #include @@ -53,6 +54,11 @@ struct schema_element { * @brief Allows specifying this column's child columns target type */ std::map child_types; + + /** + * @brief Allows specifying the order of the columns + */ + std::optional> column_order; }; /** @@ -87,13 +93,18 @@ enum class json_recovery_mode_t { * | `chunksize` | use `byte_range_xxx` for chunking instead | */ class json_reader_options { + public: + using 
 dtype_variant = + std::variant<std::vector<data_type>, + std::map<std::string, data_type>, + std::map<std::string, schema_element>, + schema_element>; ///< Variant type holding dtypes information for the columns + + private: source_info _source; // Data types of the column; empty to infer dtypes - std::variant<std::vector<data_type>, - std::map<std::string, data_type>, - std::map<std::string, schema_element>> - _dtypes; + dtype_variant _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -178,13 +189,7 @@ class json_reader_options { * * @returns Data types of the columns */ - [[nodiscard]] std::variant<std::vector<data_type>, - std::map<std::string, data_type>, - std::map<std::string, schema_element>> const& - get_dtypes() const - { - return _dtypes; - } + [[nodiscard]] dtype_variant const& get_dtypes() const { return _dtypes; } /** * @brief Returns compression format of the source. @@ -228,7 +233,11 @@ class json_reader_options { */ [[nodiscard]] size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = + std::visit(cudf::detail::visitor_overload{ + [](auto const& dtypes) { return dtypes.size(); }, + [](schema_element const& dtypes) { return dtypes.child_types.size(); }}, + _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -390,6 +399,14 @@ class json_reader_options { */ void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types schema element with column names and column order to support arbitrary nesting of + * data types + */ + void set_dtypes(schema_element types); + /** * @brief Set the compression type. * @@ -624,6 +641,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Struct schema_element with Column name -> schema_element with map and order + * @return this for chaining + */ + json_reader_options_builder& dtypes(schema_element types) + { + options.set_dtypes(std::move(types)); + return *this; + } + /** * @brief Set the compression type. *
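A minimal usage sketch of the schema_element overload added above; the file name and the field names "a" and "b" are hypothetical, so treat this as an illustration of the new builder hook rather than part of the patch. The column_order member asks the reader to emit "b" before "a".

#include <cudf/io/json.hpp>

#include <string>
#include <vector>

// Sketch: read JSON Lines with an explicit nested schema and column order.
cudf::io::table_with_metadata read_with_schema()
{
  using cudf::data_type;
  using cudf::type_id;
  cudf::io::schema_element schema{
    data_type{type_id::STRUCT},
    {{"a", {data_type{type_id::INT64}}},    // child column "a" -> INT64
     {"b", {data_type{type_id::STRING}}}},  // child column "b" -> STRING
    std::vector<std::string>{"b", "a"}};    // column_order: emit "b" first
  auto opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"input.jsonl"})
      .dtypes(schema)
      .lines(true)
      .build();
  return cudf::io::read_json(opts);
}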
+ * @param[in] stream          CUDA stream used for device memory operations and kernel launches
 * @param[in] mr              Device memory resource used to allocate the returned column's device memory
 * @returns Column of specified quantiles, with nulls for indeterminable values
@@ -59,6 +60,7 @@ std::unique_ptr<column> quantile(
   interpolation interp               = interpolation::LINEAR,
   column_view const& ordered_indices = {},
   bool exact                         = true,
+  rmm::cuda_stream_view stream       = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());

 /**
@@ -85,6 +87,7 @@ std::unique_ptr<column> quantile(
  * @param is_input_sorted Indicates if the input has been pre-sorted
  * @param column_order The desired sort order for each column
  * @param null_precedence The desired order of null compared to other elements
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table's device memory
 *
 * @returns Table of specified quantiles, with nulls for indeterminable values
@@ -98,6 +101,7 @@ std::unique_ptr<table> quantiles(
   cudf::sorted is_input_sorted                   = sorted::NO,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());

 /**
@@ -114,6 +118,7 @@ std::unique_ptr<table> quantiles(
  *
  * @param input tdigest input data. One tdigest per row
  * @param percentiles Desired percentiles in range [0, 1]
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device
 * memory
 *
@@ -125,6 +130,7 @@ std::unique_ptr<table> quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

 /** @} */  // end of group
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 1fe446db8da..e090766dd07 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -28,8 +28,42 @@ namespace strings {
  */

 /**
- * @brief Returns a lists column with character position values where each
- * of the target strings are found in each string.
+ * @brief Searches for the given target strings within each string in the provided column
+ *
+ * Each column in the result table corresponds to the result for the target string at the same
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st,
+ * etc.
+ *
+ * If the target is not found for a string, false is returned for that entry in the output column.
+ * If the target is an empty string, true is returned for all non-null entries in the output column.
+ *
+ * Any null input strings return corresponding null entries in the output columns.
+ *
+ * @code{.pseudo}
+ * input = ["a", "b", "c"]
+ * targets = ["a", "c"]
+ * output is a table with two boolean columns:
+ *   column 0: [true, false, false]
+ *   column 1: [false, false, true]
+ * @endcode
+ *
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
+ *
+ * @param input Strings instance for this operation
+ * @param targets UTF-8 encoded strings to search for in each string in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Table of BOOL8 columns
+ */
+std::unique_ptr<table>
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -45,7 +79,7 @@ namespace strings { * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 42124461cdf..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. 
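+ *
+ * A sketch of the computation for one row; the hash values below are
+ * illustrative only, not actual MurmurHash3_x86_32 outputs (for such small
+ * values the % mp and & max_hash steps are no-ops):
+ * @code{.pseudo}
+ * input = ["abcd"], width = 3    // substrings: "abc", "bcd"
+ * a = [2, 3], b = [1, 4]         // two (a,b) parameter pairs
+ * hv = [7, 9]                    // hv[k] = hash of the k-th substring
+ * pv for i=0: (7*2+1)=15, (9*2+1)=19  -> min is 15
+ * pv for i=1: (7*3+4)=25, (9*3+4)=31  -> min is 25
+ * output row = [15, 25]          // one minhash value per (a,b) pair
+ * @endcode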
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string, one value per (a,b) parameter pair
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ * max_hash = max of uint64
+ * mp = (1 << 61) - 1
+ * hv[i] = hash value of a substring at i
+ * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ * mh[j,i] = min(pv[i]) for all substrings in row j
+ *           and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
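+ *
+ * The computation mirrors nvtext::minhash_permuted above; see the worked
+ * example there (the same steps apply here with 64-bit values).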
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string, one value per (a,b) parameter pair
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *
diff --git a/cpp/scripts/parse_iwyu_output.py b/cpp/scripts/parse_iwyu_output.py
new file mode 100644
index 00000000000..822a980a1a8
--- /dev/null
+++ b/cpp/scripts/parse_iwyu_output.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Helper script to modify IWYU output to only include removals.
+
+Lines that are not from include-what-you-use are removed from the output.
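+
+For example, given IWYU output such as (paths and includes illustrative):
+
+    /cudf/cpp/src/foo.cu should add these lines:
+    #include <utility>
+
+    /cudf/cpp/src/foo.cu should remove these lines:
+    - #include <map>  // lines 12-12
+
+    The full include-list for /cudf/cpp/src/foo.cu:
+    #include <map>
+    ---
+
+the suggested additions are dropped and only the removals (plus the pruned
+full include-list) are written out. An empty "should add" header is still
+emitted because IWYU expects every section to be present.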
+""" + +import argparse +import re +from enum import Enum + + +class Mode(Enum): + NORMAL = 0 + ADD = 1 + REMOVE = 2 + FULL_INCLUDE_LIST = 3 + + +def extract_include_file(include_line): + """Extract the core file path from an #include directive.""" + match = re.search(r'#include\s+[<"]([^">]+)[">]', include_line) + if match: + return match.group(1) + return None + + +def parse_output(input_stream): + include_modifications = {} + current_file = None + mode = Mode.NORMAL + + for line in input_stream: + if match := re.match(r"(\/\S+) should add these lines:", line): + current_file = match.group(1) + include_modifications.setdefault( + current_file, + { + "add_includes": [], + "remove_includes": [], + "full_include_list": [], + }, + ) + mode = Mode.ADD + elif match := re.match(r"(\/\S+) should remove these lines:", line): + mode = Mode.REMOVE + elif match := re.match(r"The full include-list for (\/\S+):", line): + mode = Mode.FULL_INCLUDE_LIST + elif line.strip() == "---": + current_file = None + mode = Mode.NORMAL + else: + if current_file: + if mode == Mode.ADD: + include_modifications[current_file]["add_includes"].append( + line.strip() + ) + elif mode == Mode.REMOVE: + include_modifications[current_file][ + "remove_includes" + ].append(line.strip()) + elif mode == Mode.FULL_INCLUDE_LIST: + include_modifications[current_file][ + "full_include_list" + ].append(line.strip()) + else: + if ( + line.strip() + and "include-what-you-use reported diagnostics" not in line + and "In file included from" not in line + and "has correct #includes/fwd-decls" not in line + ): + print(line, end="") + + return include_modifications + + +def post_process_includes(include_modifications): + """Deduplicate and remove redundant entries from add and remove includes.""" + for mods in include_modifications.values(): + # Deduplicate add_includes and remove_includes + mods["add_includes"] = list(set(mods["add_includes"])) + mods["remove_includes"] = list(set(mods["remove_includes"])) + + # Extract file paths from add_includes and remove_includes + add_files = { + extract_include_file(line) for line in mods["add_includes"] + } + remove_files = { + extract_include_file(line) for line in mods["remove_includes"] + } + + # Remove entries that exist in both add_includes and remove_includes + common_files = add_files & remove_files + mods["add_includes"] = [ + line + for line in mods["add_includes"] + if extract_include_file(line) not in common_files + ] + mods["remove_includes"] = [ + line + for line in mods["remove_includes"] + if extract_include_file(line) not in common_files + ] + + # Remove entries that exist in add_includes from full_include_list + mods["full_include_list"] = [ + include + for include in mods["full_include_list"] + if extract_include_file(include) not in add_files + ] + + +def write_output(include_modifications, output_stream): + for filename, mods in include_modifications.items(): + if mods["remove_includes"]: + # IWYU requires all sections to exist, so we write out this header even + # though we never write out any actual additions. 
+ output_stream.write(f"{filename} should add these lines:\n\n") + + output_stream.write(f"{filename} should remove these lines:\n") + for line in mods["remove_includes"]: + output_stream.write(line + "\n") + output_stream.write("\n") + + output_stream.write(f"The full include-list for {filename}:\n") + for line in mods["full_include_list"]: + output_stream.write(line + "\n") + output_stream.write("---\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Process include modifications from a build output log." + ) + parser.add_argument( + "input", + nargs="?", + type=argparse.FileType("r"), + default="-", + help="Input file to read (default: stdin)", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="iwyu_results.txt", + help="Output file to write (default: iwyu_output.txt)", + ) + args = parser.parse_args() + + include_modifications = parse_output(args.input) + post_process_includes(include_modifications) + write_output(include_modifications, args.output) + + +if __name__ == "__main__": + main() diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..cac6c2224f0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..e8b29a0e7a8 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "compute_aggregations.hpp"
+#include "compute_global_memory_aggs.hpp"
+#include "compute_mapping_indices.hpp"
+#include "compute_shared_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass
+ * over the data and stores the results in `sparse_results`
+ */
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  // flatten the aggs to a table that can be operated on by aggregate_row
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
+    agg_kinds, stream, rmm::mr::get_current_device_resource());
+
+  auto const grid_size =
+    max_occupancy_grid_size<typename SetType::ref_type<cuco::insert_and_find_tag>>(num_rows);
+  auto const available_shmem_size = get_available_shared_memory_size(grid_size);
+  auto const has_sufficient_shmem =
+    available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
+  auto const has_dictionary_request = std::any_of(
+    requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
+      return cudf::is_dictionary(request.values.type());
+    });
+  auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;
+
+  // Performs naive global memory aggregations when the workload is not compatible with shared
+  // memory, such as when aggregating dictionary columns or when there is insufficient dynamic
+  // shared memory for shared memory aggregations.
+  if (!is_shared_memory_compatible) {
+    return compute_global_memory_aggs(num_rows,
+                                      skip_rows_with_nulls,
+                                      row_bitmask,
+                                      flattened_values,
+                                      d_agg_kinds.data(),
+                                      agg_kinds,
+                                      global_set,
+                                      aggs,
+                                      sparse_results,
+                                      stream);
+  }
+
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+  // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank
+  rmm::device_uvector<cudf::size_type> local_mapping_index(num_rows, stream);
+  // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table
+  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
+                                                            stream);
+  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
+
+  // Flag indicating whether a global memory aggregation fallback is required or not
+  rmm::device_scalar<cuda::std::atomic_flag> needs_global_memory_fallback(stream);
+
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  compute_mapping_indices(grid_size,
+                          num_rows,
+                          global_set_ref,
+                          row_bitmask,
+                          skip_rows_with_nulls,
+                          local_mapping_index.data(),
+                          global_mapping_index.data(),
+                          block_cardinality.data(),
+                          needs_global_memory_fallback.data(),
+                          stream);
+
+  cuda::std::atomic_flag h_needs_fallback;
+  // Cannot use `device_scalar::value` as it requires a copy constructor, which
+  // `atomic_flag` doesn't have.
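+  // Instead, copy the flag's raw bytes to the host and synchronize the stream
+  // before reading the result below.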
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback,
+                                needs_global_memory_fallback.data(),
+                                sizeof(cuda::std::atomic_flag),
+                                cudaMemcpyDefault,
+                                stream.value()));
+  stream.synchronize();
+  auto const needs_fallback = h_needs_fallback.test();
+
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds.data(),
+                                                         agg_kinds,
+                                                         needs_fallback,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+  // prepare to launch kernel to do the actual aggregation
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+
+  compute_shared_memory_aggs(grid_size,
+                             available_shmem_size,
+                             num_rows,
+                             row_bitmask,
+                             skip_rows_with_nulls,
+                             local_mapping_index.data(),
+                             global_mapping_index.data(),
+                             block_cardinality.data(),
+                             *d_values,
+                             *d_sparse_table,
+                             d_agg_kinds.data(),
+                             stream);
+
+  // The shared memory groupby is designed so that each thread block can handle up to 128 unique
+  // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store
+  // the temporary aggregation results. In these situations, we must fall back to a global memory
+  // aggregator to process the remaining aggregation requests.
+  if (needs_fallback) {
+    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator{0},
+                       num_rows,
+                       global_memory_fallback_fn{global_set_ref,
+                                                 *d_values,
+                                                 *d_sparse_table,
+                                                 d_agg_kinds.data(),
+                                                 block_cardinality.data(),
+                                                 stride,
+                                                 row_bitmask,
+                                                 skip_rows_with_nulls});
+    extract_populated_keys(global_set, populated_keys, stream);
+  }
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp
similarity index 70%
rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp
rename to cpp/src/groupby/hash/compute_aggregations.hpp
index a7434bdf61a..829c3c808b0 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_aggregations.hpp
@@ -21,6 +21,7 @@
 #include
 #include
+#include

 namespace cudf::groupby::detail::hash {
 /**
@@ -28,11 +29,12 @@ namespace cudf::groupby::detail::hash {
  * over the data and stores the results in `sparse_results`
  */
 template <typename SetType>
-void compute_single_pass_aggs(int64_t num_keys,
-                              bool skip_rows_with_nulls,
-                              bitmask_type const* row_bitmask,
-                              SetType set,
-                              cudf::host_span<aggregation_request const> requests,
-                              cudf::detail::result_cache* sparse_results,
-                              rmm::cuda_stream_view stream);
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu
new file mode 100644
index 00000000000..1d7184227ea
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_aggregations_null.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c)
2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_aggregations.cuh"
+#include "compute_aggregations.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_aggregations<nullable_global_set_t>(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  nullable_global_set_t& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
new file mode 100644
index 00000000000..6025686953e
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  global_set_t& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
new file mode 100644
index 00000000000..00db149c6d9
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_global_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  auto constexpr uses_global_memory_aggs = true;
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds,
+                                                         agg_kinds,
+                                                         uses_global_memory_aggs,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+
+  // prepare to launch kernel to do the actual aggregation
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::counting_iterator{0},
+    num_rows,
+    hash::compute_single_pass_aggs_fn{
+      global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls});
+  extract_populated_keys(global_set, populated_keys, stream);
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggregations.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
new file mode 100644
index 00000000000..0777b9ffd93
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
new file mode 100644
index 00000000000..209e2b7f20a
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullable_global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  nullable_global_set_t& global_set,
+  std::vector<std::unique_ptr<cudf::aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index 59457bea694..e1dbf2a3d9e 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */

+#include "compute_aggregations.hpp"
 #include "compute_groupby.hpp"
-#include "compute_single_pass_aggs.hpp"
 #include "helpers.cuh"
 #include "sparse_to_dense_results.hpp"

@@ -29,7 +29,6 @@
 #include
 #include
-#include
 #include
 #include

@@ -38,18 +37,6 @@
 #include

 namespace cudf::groupby::detail::hash {
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream)
-{
-  rmm::device_uvector<size_type> populated_keys(num_keys, stream);
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-  return populated_keys;
-}
-
 template <typename Equal, typename Hash>
 std::unique_ptr<table>
 compute_groupby(table_view const& keys,
                 host_span<aggregation_request const> requests,
@@ -67,8 +54,8 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
   // column is indexed by the hash set
   cudf::detail::result_cache sparse_results(requests.size());

-  auto const set = cuco::static_set{
-    num_keys,
+  auto set = cuco::static_set{
+    cuco::extent{num_keys},
     cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% load factor
     cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     d_row_equal,
@@ -84,17 +71,13 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
                       : rmm::device_buffer{};

   // Compute all single pass aggs first
-  compute_single_pass_aggs(num_keys,
-                           skip_rows_with_nulls,
-                           static_cast<bitmask_type*>(row_bitmask.data()),
-                           set.ref(cuco::insert_and_find),
-                           requests,
-                           &sparse_results,
-                           stream);
-
-  // Extract the populated indices from the hash set and create a gather map.
-  // Gathering using this map from sparse results will give dense results.
-  auto gather_map = extract_populated_keys(set, keys.num_rows(), stream);
+  auto gather_map = compute_aggregations(num_keys,
+                                         skip_rows_with_nulls,
+                                         static_cast<bitmask_type*>(row_bitmask.data()),
+                                         set,
+                                         requests,
+                                         &sparse_results,
+                                         stream);

   // Compact all results from sparse_results and insert into cache
   sparse_to_dense_results(requests,
@@ -114,12 +97,6 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
                           mr);
 }

-template rmm::device_uvector<size_type> extract_populated_keys(
-  global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
-template rmm::device_uvector<size_type> extract_populated_keys(
-  nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
 template std::unique_ptr<table>
 compute_groupby(
  table_view const& keys,
  host_span<aggregation_request const> requests,
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
index 7bb3a60ff07..77243dc0a4f 100644
--- a/cpp/src/groupby/hash/compute_groupby.hpp
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -22,28 +22,11 @@
 #include
 #include
-#include
 #include
 #include

 namespace cudf::groupby::detail::hash {
-/**
- * @brief Computes and returns a device vector containing all populated keys in
- * `key_set`.
- *
- * @tparam SetType Type of key hash set
- *
- * @param key_set Key hash set
- * @param num_keys Number of input keys
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return An array of unique keys contained in `key_set`
- */
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream);
-
 /**
  * @brief Computes groupby using hash table.
  *
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 12c02a1865e..f0361ccced2 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -47,9 +47,8 @@ struct size_of_functor {
 /// Shared memory data alignment
 CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;

-// Prepares shared memory data required by each output column, exits if
-// no enough memory space to perform the shared memory aggregation for the
-// current output column
+// Allocates shared memory required for output columns. Exits if there is insufficient memory to
+// perform shared memory aggregation for the current output column.
 __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                                cudf::size_type& col_end,
                                                cudf::mutable_table_device_view output_values,
@@ -74,9 +73,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                        ALIGNMENT);
     auto const next_col_total_size = next_col_size + valid_col_size;

-    if (bytes_allocated + next_col_total_size > total_agg_size) {
-      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
-    }
+    if (bytes_allocated + next_col_total_size > total_agg_size) { break; }

     shmem_agg_res_offsets[col_end]  = bytes_allocated;
     shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
@@ -275,7 +272,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
   }
 }
 }  // namespace

-std::size_t available_shared_memory_size(cudf::size_type grid_size)
+std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
@@ -302,11 +299,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
 {
   // For each aggregation, need one offset determining where the aggregation is
   // performed, another indicating the validity of the aggregation
-  auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
+  auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns());
   // The rest of shmem is utilized for the actual arrays in shmem
-  CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2,
+  CUDF_EXPECTS(available_shmem_size > offsets_size * 2,
                "No enough space for shared memory aggregations");
-  auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2;
+  auto const shmem_agg_size = available_shmem_size - offsets_size * 2;
   single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream.value()>>>(
     num_input_rows,
row_bitmask, @@ -318,6 +315,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 653821fd53b..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,8 +22,12 @@ #include namespace cudf::groupby::detail::hash { +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, @@ -37,5 +41,4 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu deleted file mode 100644 index e292543e6e9..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -namespace cudf::groupby::detail::hash { -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_keys, - hash::compute_single_pass_aggs_fn{ - set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - nullable_hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 22fa4fc584c..bc32e306b3f 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -15,53 +15,110 @@ */ #include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include #include #include -#include -#include -#include #include #include #include +#include + +#include #include #include #include namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view 
const& flattened_values, - std::vector aggs, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - sparse_columns.reserve(flattened_values.num_columns()); - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + agg_kinds.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto const mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); + } + return sparse_table; +} - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index c1d4e0d3f20..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -15,18 +15,41 @@ */ #pragma once +#include #include #include #include #include #include +#include #include namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
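+ * Each populated key is the row index of a unique grouping key that was
+ * inserted into the hash set.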
+ * + * @tparam SetType Type of the key hash set + * + * @param key_set Key hash set + * @param populated_keys Array of unique keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs_kinds, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 00836567b4f..f950e03e0fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 28a5b578e00..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -15,12 +15,14 @@ */ #pragma once -#include +#include "helpers.cuh" + #include -#include -#include +#include +#include +#include -#include +#include namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED issue tracked via #17171 @@ -104,6 +106,114 @@ struct initialize_shmem { } }; +template +struct initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void 
operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct global_memory_fallback_fn { + SetType set; + cudf::table_device_view input_values; + cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + cudf::size_type stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + + global_memory_fallback_fn(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + cudf::size_type stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) + { + } + + __device__ void operator()(cudf::size_type i) + { + auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE; + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 570a00cbfc2..7fafa885c66 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -269,7 +269,8 @@ std::map unified_schema(cudf::io::json_reader_optio }); return dnew; }, - [](std::map const& user_dtypes) { return user_dtypes; }}, + [](std::map const& user_dtypes) { return user_dtypes; }, + [](schema_element const& user_dtypes) { return user_dtypes.child_types; }}, options.get_dtypes()); } @@ -492,7 +493,7 @@ std::pair, hashmap_of_device_columns> build_tree auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); - auto lookup_names = [&column_names](auto child_ids, auto name) { + auto lookup_names = [&column_names](auto const& child_ids, auto const& name) { for (auto const& child_id : child_ids) { if (column_names[child_id] == name) return child_id; } @@ -569,7 +570,7 @@ std::pair, hashmap_of_device_columns> build_tree for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); i++) { NodeIndexT const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; auto value_id = std::stol(name); if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); @@ -580,7 +581,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if 
(user_dtypes.count(name)) mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); } @@ -589,10 +590,19 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, user_dtypes.at(name)); } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + schema_element const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto const& name = column_names[first_child_id]; + if (user_dtypes.child_types.count(name) != 0) + mark_is_pruned(first_child_id, user_dtypes.child_types.at(name)); + } }}, options.get_dtypes()); } else { @@ -626,7 +636,9 @@ std::pair, hashmap_of_device_columns> build_tree [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( std::map const& user_dtypes) -> void { mark_is_pruned(root_struct_col_id, u_schema); - }}, + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](schema_element const& user_dtypes) + -> void { mark_is_pruned(root_struct_col_id, u_schema); }}, options.get_dtypes()); } // Useful for array of arrays @@ -714,7 +726,7 @@ std::pair, hashmap_of_device_columns> build_tree if (expected_category == NC_STRUCT) { // find field column ids, and its children and create columns. for (auto const& field_id : child_ids) { - auto name = column_names[field_id]; + auto const& name = column_names[field_id]; if (is_pruned[field_id]) continue; auto inserted = ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; @@ -745,7 +757,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map> array_values; for (auto const& child_id : child_ids) { if (is_pruned[child_id]) continue; - auto name = column_names[child_id]; + auto const& name = column_names[child_id]; array_values[std::stoi(name)].push_back(child_id); } // diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7e4d975e431..30a154fdda2 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -399,9 +399,9 @@ std::pair, std::vector> device_json_co // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema return {std::move(col), std::vector{}}; @@ -410,12 +410,37 @@ std::pair, std::vector> device_json_co std::vector> child_columns; std::vector column_names{}; size_type num_rows{json_col.num_rows}; + + bool const has_column_order = + prune_columns and not schema.value_or(schema_element{}) + .column_order.value_or(std::vector{}) + .empty(); + + auto const& col_order = + has_column_order ? 
schema.value().column_order.value() : json_col.column_order; + // Create children columns - for (auto const& col_name : json_col.column_order) { - auto const& col = json_col.child_columns.find(col_name); - column_names.emplace_back(col->first); - auto& child_col = col->second; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); + auto const found_it = json_col.child_columns.find(col_name); + + if (prune_columns and found_it == std::end(json_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + column_names.emplace_back(make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), col_name)); + auto all_null_column = make_all_nulls_column( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), + num_rows, + stream, + mr); + child_columns.emplace_back(std::move(all_null_column)); + continue; + } + column_names.emplace_back(found_it->first); + + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( child_col, d_input, options, prune_columns, child_schema_element, stream, mr); @@ -576,11 +601,21 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto& json_col = root_struct_col.child_columns.find(col_name)->second; + schema_element const* prune_schema = std::get_if(&options.get_dtypes()); + bool const has_column_order = options.is_enabled_prune_columns() and prune_schema != nullptr and + prune_schema->column_order.has_value() and + not prune_schema->column_order->empty(); + auto const& col_order = + has_column_order ? prune_schema->column_order.value() : root_struct_col.column_order; + if (has_column_order) { + CUDF_EXPECTS(prune_schema->child_types.size() == col_order.size(), + "Input schema column order size mismatch with input schema child types"); + } + auto root_col_size = root_struct_col.num_rows; + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ [column_index](std::vector const& user_dtypes) -> std::optional { @@ -590,17 +625,23 @@ table_with_metadata device_parse_nested_json(device_span d_input, }, [col_name]( std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) + return std::optional{{it->second}}; + return std::nullopt; }, [col_name](std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? 
user_dtypes.find(col_name)->second - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) return it->second; + return std::nullopt; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + if (auto it = user_dtypes.child_types.find(col_name); + it != std::end(user_dtypes.child_types)) + return it->second; + return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -608,20 +649,39 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif + auto const found_it = root_struct_col.child_columns.find(col_name); + if (options.is_enabled_prune_columns() and + found_it == std::end(root_struct_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + // inserts all null column + out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); + auto all_null_column = + make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); + out_columns.emplace_back(std::move(all_null_column)); + column_index++; + continue; + } + auto& json_col = found_it->second; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { // Get this JSON column's cudf column and schema info, (modifies json_col) auto [cudf_col, col_name_info] = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 34a87918e57..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC; constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; +auto get_sgid_lut(SymbolT delim) +{ + // The i-th string representing all the characters of a symbol group + 
std::array, NUM_SYMBOL_GROUPS - 1> symbol_groups{ + {{'\"'}, {'\''}, {'\\'}, {delim}}}; + return symbol_groups; +} // Transition table std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ + /* IN_STATE " ' \ <delim> OTHER */ /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes { namespace normalize_whitespace { +// We do not need a symbol group for the delimiter character since whitespace normalization +// now occurs after tokenization. enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; // Alias for readability of symbol group ids constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}}; /** * -------- FST states --------- * ----------------------------- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character + * | quotes as well as any other character not enclosed by a string. + * TT_DQS | Double-quoted string state handling all characters within double quotes * TT_DEC | State handling escaped characters inside double-quoted string. Note that this * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed.
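Aside: both FSTs in this file share one mechanism. Each input character is first classified into a symbol group, and the pair (current state, symbol group) then indexes a transition table to pick the next state. The host-side sketch below walks a DFA shaped like the post-change whitespace-normalization table wna_state_tt in the next hunk; it is illustrative only, not part of this patch, and every name in it is invented for the example.

#include <array>
#include <cstdint>
#include <string>

enum state : uint8_t { OOS, DQS, DEC, NUM_STATES };             // mirrors TT_OOS/TT_DQS/TT_DEC
enum group : uint8_t { QUOTE, ESCAPE, WS, OTHER, NUM_GROUPS };  // symbol groups

// One row per state, one column per symbol group, as in wna_state_tt.
constexpr std::array<std::array<uint8_t, NUM_GROUPS>, NUM_STATES> tt{{
  /* OOS */ {{DQS, OOS, OOS, OOS}},
  /* DQS */ {{OOS, DEC, DQS, DQS}},
  /* DEC */ {{DQS, DQS, DQS, DQS}},
}};

// Classify a character into its symbol group (the job of the symbol-group LUT).
uint8_t classify(char c)
{
  if (c == '"') return QUOTE;
  if (c == '\\') return ESCAPE;
  if (c == ' ' || c == '\t') return WS;
  return OTHER;
}

// Run the DFA over an input string; the GPU FST additionally emits output
// symbols on each transition, which this sketch omits.
uint8_t run_dfa(std::string const& input)
{
  uint8_t s = OOS;  // the DFA's start state, as in the patch
  for (char c : input) { s = tt[s][classify(c)]; }
  return s;
}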
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); // Transition table std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + {/* IN_STATE " \ OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; // The DFA's starting state constexpr StateT start_state = static_cast(TT_OOS); @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS { namespace detail { void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; - auto parser = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), - fst::detail::make_transition_table(normalize_quotes::qna_state_tt), - fst::detail::make_translation_functor( - normalize_quotes::TransduceToNormalizedQuotes{}), - stream); + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); cudf::detail::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 7b3b04dea16..4989fff4b30 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,29 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create all null column of a given nested schema + * + * @param schema The schema of the column to create + * @param num_rows The number of rows in the column + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The all null column + */ +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Create metadata for a column of a given schema + * + * @param schema The schema of the column + * @param col_name The name of the column + * @return column metadata for a given schema + */ +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); + /** * @brief Get the path data type of a column by path if present in input schema * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 60e78f4763d..f1c2826c62a 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2198,9 +2198,9 @@ std::pair, std::vector> json_column_to // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), 
std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 4caa5cd9e24..401a6e992de 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -16,14 +16,201 @@ #include "nested_json.hpp" +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include +namespace cudf::io { +namespace { +bool validate_column_order(schema_element const& types) +{ + // For struct types, check that column_order size matches child_types size and that every + // element of column_order is present in child_types; recurse into each struct or list child. + // For list types, check that child_types has exactly one entry and recurse into it. + if (types.type.id() == type_id::STRUCT) { + if (types.column_order.has_value()) { + if (types.column_order.value().size() != types.child_types.size()) { return false; } + for (auto const& column_name : types.column_order.value()) { + auto it = types.child_types.find(column_name); + if (it == types.child_types.end()) { return false; } + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + } + } else if (types.type.id() == type_id::LIST) { + if (types.child_types.size() != 1) { return false; } + auto it = types.child_types.begin(); + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + return true; +} +} // namespace + +void json_reader_options::set_dtypes(schema_element types) +{ + CUDF_EXPECTS( + validate_column_order(types), "Column order does not match child types", std::invalid_argument); + _dtypes = std::move(types); +} +} // namespace cudf::io + namespace cudf::io::json::detail { +/// Creates an empty column of the specified schema +struct empty_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + template ())> + std::unique_ptr operator()(schema_element const& schema) const + { + return make_empty_column(schema.type); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); + auto offsets = make_empty_column(data_type(type_to_id())); + return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name))); + } + return make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } +}; + +/// Creates an all-null column of the specified schema +struct allnull_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + private: + auto make_zeroed_offsets(size_type size) const + { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return
std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + } + + public: + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); + } + + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto indices = make_zeroed_offsets(size - 1); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_dictionary_column( + std::move(child), std::move(indices), std::move(null_mask), size, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_strings_column( + size, std::move(offsets), rmm::device_buffer{}, size, std::move(null_mask)); + } + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_lists_column( + size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); + } + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_structs_column( + size, std::move(child_columns), size, std::move(null_mask), stream, mr); + } +}; + +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, allnull_column_functor{stream, mr}, schema, num_rows); +} + +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name) +{ + column_name_info info; + info.name = col_name; + switch (schema.type.id()) { + case type_id::STRUCT: + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + info.children.push_back( + make_column_name_info(schema.child_types.at(child_name), child_name)); + } + break; + case type_id::LIST: + info.children.emplace_back("offsets"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::DICTIONARY32: +
info.children.emplace_back("indices"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::STRING: info.children.emplace_back("offsets"); break; + default: break; + } + return info; +} + std::optional child_schema_element(std::string const& col_name, cudf::io::json_reader_options const& options) { @@ -46,6 +233,11 @@ std::optional child_schema_element(std::string const& col_name, return (user_dtypes.find(col_name) != std::end(user_dtypes)) ? user_dtypes.find(col_name)->second : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; }}, options.get_dtypes()); } diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2bc15ea19cb..279f5e71351 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 45380e6ea20..aaf5ebfbe7d 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -147,6 +147,8 @@ __device__ void gpuDecodeFixedWidthValues( } break; } + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(sb, src_pos, static_cast(dst)); } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, sb, src_pos, static_cast(dst)); } else if (dtype_len == 8) { @@ -841,6 +843,33 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +inline __device__ void bool_plain_decode(page_state_s* s, state_buf* sb, int t, int to_decode) +{ + int pos = s->dict_pos; + int const target_pos = pos + to_decode; + + while (pos < target_pos) { + int const batch_len = min(target_pos - pos, decode_block_size_t); + + if (t < batch_len) { + int const bit_pos = pos + t; + int const byte_offset = bit_pos >> 3; + int const bit_in_byte_index = bit_pos & 7; + + uint8_t const* const read_from = s->data_start + byte_offset; + bool const read_bit = (*read_from) & (1 << bit_in_byte_index); + + int const write_to_index = rolling_index(bit_pos); + sb->dict_idx[write_to_index] = read_bit; + } + + pos += batch_len; + } + + if (t == 0) { s->dict_pos = pos; } +} + template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { @@ -872,14 +901,7 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template - typename DecodeValuesFunc> +template CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, @@ -887,12 +909,33 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) size_t num_rows, kernel_error::pointer error_code) { + constexpr bool has_dict_t = (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT) || + 
(kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST); + constexpr bool has_bools_t = (kernel_mask_t == decode_kernel_mask::BOOLEAN) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST); + constexpr bool has_nesting_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); + constexpr bool has_lists_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr bool split_decode_t = + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); __shared__ __align__(16) page_state_s state_g; using state_buf_t = page_state_buffers_s; __shared__ __align__(16) state_buf_t state_buffers; @@ -920,32 +963,31 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + using value_decoder_type = std::conditional_t< + split_decode_t, + decode_fixed_width_split_values_func, + decode_fixed_width_values_func>; + value_decoder_type decode_values; bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - constexpr int shared_rep_size = - has_lists_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_dict_size = - has_dict_t - ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; + constexpr int rle_run_buffer_bytes = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = + rle_run_buffer_bytes * (static_cast(has_dict_t) + static_cast(has_bools_t) + + static_cast(has_lists_t) + 1); __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; - rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t) { shared_offset += shared_rep_size; } - - rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_dict_t) { shared_offset += shared_dict_size; } - rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); + int shared_offset = 0; + auto rep_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_lists_t) { shared_offset += rle_run_buffer_bytes; } + auto dict_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += rle_run_buffer_bytes; } + auto bool_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_bools_t) { shared_offset += rle_run_buffer_bytes; } + auto def_runs = reinterpret_cast(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; @@ -974,6 +1016,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); } + // Use dictionary stream memory for bools + rle_stream bool_stream{bool_runs}; + bool bools_are_rle_stream = (s->dict_run == 0); + if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.init(1, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } + } + __syncthreads(); + // We use two counters in the loop below: processed_count and valid_count. // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call @@ -1041,13 +1093,20 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } __syncthreads(); - // if we have dictionary data + // if we have dictionary or bool data + // We want to limit the number of dictionary/bool items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. if constexpr (has_dict_t) { - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. 
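Aside, for readers tracing bool_plain_decode introduced earlier in this file: PLAIN-encoded Parquet booleans are bit-packed, least-significant bit first within each byte, which is why the helper splits a bit position into byte_offset = pos >> 3 and bit_in_byte_index = pos & 7. A minimal host-side sketch of the same arithmetic follows; it is not part of the patch and read_packed_bool is an invented name.

#include <cstdint>

// Read bit `pos` from an LSB-first bit-packed buffer, mirroring the
// byte_offset / bit_in_byte_index arithmetic in bool_plain_decode.
bool read_packed_bool(uint8_t const* data, int pos)
{
  int const byte_offset       = pos >> 3;  // pos / 8
  int const bit_in_byte_index = pos & 7;   // pos % 8
  return (data[byte_offset] >> bit_in_byte_index) & 1;
}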
dict_stream.decode_next(t, next_valid_count - valid_count); __syncthreads(); + } else if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.decode_next(t, next_valid_count - valid_count); + } else { + bool_plain_decode(s, sb, t, next_valid_count - valid_count); + } + __syncthreads(); } // decode the values themselves @@ -1061,250 +1120,82 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } // anonymous namespace -void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using kernel_tag_t = std::integral_constant; -void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using int_tag_t = std::integral_constant; -void __host__ -DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { - constexpr int 
decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (is_list) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else if (has_nesting) { - gpuDecodePageDataGeneric + // No template parameters on lambdas until C++20, so use type tags instead + auto launch_kernel = [&](auto block_size_tag, auto kernel_mask_tag) { + constexpr int decode_block_size = decltype(block_size_tag)::value; + constexpr decode_kernel_mask mask = decltype(kernel_mask_tag)::value; + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodePageDataGeneric + gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } + }; + + switch (kernel_mask) { + case decode_kernel_mask::FIXED_WIDTH_NO_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::FIXED_WIDTH_DICT_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST: + launch_kernel(int_tag_t<128>{}, + kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_NESTED: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + case decode_kernel_mask::BOOLEAN_LIST: + launch_kernel(int_tag_t<128>{}, kernel_tag_t{}); + break; + default: CUDF_EXPECTS(false, "Kernel type not handled by this function"); break; } } diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 62f1ee88036..5b9831668e6 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -343,8 +343,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, 
{rep_runs}}; diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 52d53cb8225..a8a8c441a84 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,9 +181,13 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } else if (is_boolean(chunk)) { + return is_list(chunk) ? decode_kernel_mask::BOOLEAN_LIST + : is_nested(chunk) ? decode_kernel_mask::BOOLEAN_NESTED + : decode_kernel_mask::BOOLEAN; } - if (!is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk)) { if (page.encoding == Encoding::PLAIN) { return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index ca74a1c2ba0..5ece3a54892 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -618,8 +618,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBo constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index dba24b553e6..3b4d0e6dc80 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -224,6 +224,9 @@ enum class decode_kernel_mask { FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + BOOLEAN = (1 << 15), // Run decode kernel for boolean data + BOOLEAN_NESTED = (1 << 16), // Run decode kernel for nested boolean data + BOOLEAN_LIST = (1 << 17), // Run decode kernel for list boolean data }; // mask representing all the ways in which a string can be encoded @@ -539,7 +542,7 @@ enum class encode_kernel_mask { DELTA_BINARY = (1 << 2), // Run DELTA_BINARY_PACKED encoding kernel DELTA_LENGTH_BA = (1 << 3), // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel DELTA_BYTE_ARRAY = (1 << 4), // Run DELTA_BYTE_ARRAY encoding kernel - BYTE_STREAM_SPLIT = (1 << 5), // Run plain encoding kernel, but split streams + BYTE_STREAM_SPLIT = (1 << 5) // Run plain encoding kernel, but split streams }; /** @@ -911,72 +914,18 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. + * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch.
* @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading dictionary fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. - * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[in] is_list Whether or not the data contains list data. 
- * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - bool is_list, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 689386b8957..d74ae83b635 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -97,22 +97,37 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } + // Compute column string sizes (using page string offsets) for this subpass col_string_sizes = calculate_page_string_offsets(); - // check for overflow + // ensure cumulative column string sizes have been initialized + if (pass.cumulative_col_string_sizes.empty()) { + pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); + } + + // Add to the cumulative column string sizes of this pass + std::transform(pass.cumulative_col_string_sizes.begin(), + pass.cumulative_col_string_sizes.end(), + col_string_sizes.begin(), + pass.cumulative_col_string_sizes.begin(), + std::plus<>{}); + + // Check for overflow in cumulative column string sizes of this pass so that the page string + // offsets of overflowing (large) string columns are treated as 64-bit. 
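Aside: the check below operates on the cumulative sizes rather than on this subpass's col_string_sizes alone. A standalone sketch of the same pattern follows, with invented names and under the assumption that the threshold is the 32-bit offset limit; it is illustrative only, not part of the patch.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Accumulate this pass's per-column string sizes into the running totals, then
// flag whether any column's *cumulative* size crosses the large-strings
// threshold. A per-pass check alone would miss a column that grows slowly
// across several passes but overflows in aggregate.
bool needs_64bit_offsets(std::vector<std::size_t>& cumulative,
                         std::vector<std::size_t> const& this_pass,
                         std::size_t threshold)
{
  std::transform(cumulative.begin(), cumulative.end(),
                 this_pass.begin(), cumulative.begin(), std::plus<>{});
  return std::any_of(cumulative.cbegin(), cumulative.cend(),
                     [threshold](std::size_t sz) { return sz > threshold; });
}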
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), - col_string_sizes.cend(), + auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), + pass.cumulative_col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // mark any chunks that are large string columns + // Mark any chunks for which the cumulative column string size has exceeded the + // large strings threshold if (has_large_strings) { for (auto& chunk : pass.chunks) { auto const idx = chunk.src_col_index; - if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } + if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } } } } @@ -195,7 +210,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); + out_buf.create_string_data( + col_string_sizes[pass.chunks[c].src_col_index], + pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > + static_cast(strings::detail::get_offset64_threshold()), + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -219,8 +238,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - // launch string decoder int s_idx = 0; + + auto decode_data = [&](decode_kernel_mask decoder_mask) { + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + decoder_mask, + error_code.data(), + streams[s_idx++]); + }; + + // launch string decoder if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData(subpass.pages, pass.chunks, @@ -266,41 +297,17 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch byte stream split decoder if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT); } // launch byte stream split decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); } // launch byte stream split decoder, for list columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); } // launch byte stream split decoder @@ -316,80 +323,47 @@ void reader::impl::decode_page_data(read_mode mode, 
size_t skip_rows, size_t num // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT); } // launch fixed width type decoder for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST); } // launch fixed width type decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED); + } + + // launch boolean type decoder + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN) != 0) { + decode_data(decode_kernel_mask::BOOLEAN); + } + + // launch boolean type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_NESTED) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_NESTED); + } + + // launch boolean type decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_LIST) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_LIST); } // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT); } // launch fixed width type decoder with dictionaries for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_LIST); } // launch fixed width type decoder with dictionaries, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_NESTED); } // launch the catch-all page decoder diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index a0c2dbd3e44..ca46f198bb8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -130,6 +130,9 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; + // cumulative strings column sizes. + std::vector cumulative_col_string_sizes{}; + int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 69e783a89d0..3c49de0c997 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -152,7 +152,6 @@ __device__ inline void decode(level_t* const output, } // a single rle run.
may be broken up into multiple rle_batches -template struct rle_run { int size; // total size of the run int output_pos; // absolute position of this run w.r.t output @@ -183,14 +182,14 @@ struct rle_stream { level_t* output; - rle_run* runs; + rle_run* runs; int output_pos; int fill_index; int decode_index; - __device__ rle_stream(rle_run* _runs) : runs(_runs) {} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} __device__ inline bool is_last_decode_warp(int warp_id) { @@ -217,7 +216,7 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } - __device__ inline int get_rle_run_info(rle_run& run) + __device__ inline int get_rle_run_info(rle_run& run) { run.start = cur; run.level_run = get_vlq32(run.start, end); @@ -384,7 +383,7 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - rle_run run; + rle_run run; int run_bytes = get_rle_run_info(run); if ((output_pos + run.size) > target_count) { diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 6d954753af8..41ed55cd090 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -63,9 +63,11 @@ void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_d } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + bool is_large_strings_col, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _is_large_strings_col = is_large_strings_col; + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } namespace { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 31c8b781e77..da19539f509 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -246,13 +246,17 @@ class inline_column_buffer : public column_buffer_base { [[nodiscard]] size_t data_size_impl() const { return _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, + bool is_large_strings_col, + rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } + [[nodiscard]] bool is_large_strings_column() const { return _is_large_strings_col; } private: rmm::device_buffer _string_data{}; + bool _is_large_strings_col{}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 4bc303a34a5..66d0a644c12 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -27,8 +27,7 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu { // if the size of _string_data is over the threshold for 64bit size_type, _data will contain // sizes rather than offsets. need special handling for that case. 
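+ // Note: with chunked reads, each individual _string_data buffer can stay under the threshold + // even when the column's cumulative size across passes exceeds it, so the reader now records + // that decision up front (via create_string_data) instead of re-deriving it from the buffer + // size here.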
- auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - if (_string_data.size() > threshold) { + if (is_large_strings_column()) { if (not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 15de5d85614..68377ad6d5f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -72,8 +72,12 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (size < _gds_write_preferred_threshold) { return false; } - return supports_device_write(); + if (!supports_device_write()) { return false; } + + // Always prefer device writes if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_write_preferred_threshold; } std::future device_write_async(void const* gpu_data, diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 15a4a270ce0..0870e4a84a7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,8 +33,13 @@ #include #include +#include #include +#ifdef CUDF_KVIKIO_REMOTE_IO +#include +#endif + namespace cudf { namespace io { namespace { @@ -90,8 +95,12 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - if (size < _gds_read_preferred_threshold) { return false; } - return supports_device_read(); + if (!supports_device_read()) { return false; } + + // Always prefer device reads if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_read_preferred_threshold; } std::future device_read_async(size_t offset, @@ -389,6 +398,96 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO +/** + * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. 
+ */ +class remote_file_source : public datasource { + static std::unique_ptr create_s3_endpoint(char const* filepath) + { + auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); + return std::make_unique(bucket_name, bucket_object); + } + + public: + explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + + ~remote_file_source() override = default; + + [[nodiscard]] bool supports_device_read() const override { return true; } + + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } + + [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return device_read_async(offset, size, dst, stream).get(); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset).get(); + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto const count = std::min(size, this->size() - offset); + std::vector h_data(count); + this->host_read(offset, count, h_data.data()); + return datasource::buffer::create(std::move(h_data)); + } + + /** + * @brief Is `url` referring to a remote file supported by KvikIO? + * + * For now, only S3 URLs (URLs starting with "s3://") are supported. + */ + static bool is_supported_remote_url(std::string const& url) + { + // Regular expression to match "s3://" + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + return std::regex_search(url, pattern); + } + + private: + kvikio::RemoteHandle _kvikio_file; +}; +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false.
+ */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const&) { return false; } +}; +#endif } // namespace std::unique_ptr datasource::create(std::string const& filepath, @@ -403,8 +502,9 @@ std::unique_ptr datasource::create(std::string const& filepath, CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); }(); - - if (use_memory_mapping) { + if (remote_file_source::is_supported_remote_url(filepath)) { + return std::make_unique(filepath.c_str()); + } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { // `file_source` reads the file directly, without memory mapping diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 80fd72a3088..21f6fe87a62 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -195,10 +195,11 @@ std::unique_ptr quantile(column_view const& input, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); + return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 69421f3bfc4..a94fb9362b9 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -103,17 +103,12 @@ std::unique_ptr
quantiles(table_view const& input, cudf::sorted is_input_sorted, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantiles(input, - q, - interp, - is_input_sorted, - column_order, - null_precedence, - cudf::get_default_stream(), - mr); + return detail::quantiles( + input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 43c3b0a291b..fb5aebb4b39 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -410,10 +410,11 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, stream, mr); } } // namespace cudf diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..1183e3e4038 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offset[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. 
+ * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential
+ * in the d_indices array:
+ * - tgt1_idx = d_indices[map_idx] = 3   --> d_targets[3] == 'hi'
+ * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello'
+ * The logic now only needs to check for either of these 2 targets.
+ *
+ * This kernel works in either thread-per-string or warp-per-string mode,
+ * depending on the template parameter. If tile_size==1, the kernel executes
+ * with one thread per string; if tile_size==32, it executes with one warp per
+ * string. No other options are supported for now.
+ *
+ * @tparam tile_size Number of threads per string
+ * @param d_strings Input strings
+ * @param d_targets Target strings to search within input strings
+ * @param d_first_bytes Sorted, unique list of first bytes of the target strings
+ * @param d_indices Indices to map sorted d_first_bytes to d_targets
+ * @param d_offsets Offsets to map d_indices to d_targets
+ * @param unique_count Number of unique values in d_first_bytes (and d_offsets)
+ * @param working_memory Global memory to use if shared-memory is too small
+ * @param d_results Bool results for each target within each string row
+ */
+template <int32_t tile_size>
+CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
+                                       column_device_view const d_targets,
+                                       u_char const* d_first_bytes,
+                                       size_type const* d_indices,
+                                       size_type const* d_offsets,
+                                       size_type unique_count,
+                                       bool* working_memory,
+                                       cudf::device_span<bool*> d_results)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  // get the string for this tile
+  auto const d_str = d_strings.element<string_view>(str_idx);
+
+  namespace cg = cooperative_groups;
+  auto const tile        = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const lane_idx    = tile.thread_rank();
+  auto const num_targets = d_targets.size();
+
+  // size of shared_bools = num_targets * block_size
+  // each thread uses num_targets bools
+  extern __shared__ bool shared_bools[];
+  // bools for the current string
+  auto bools = working_memory == nullptr
+                 ? (shared_bools + (tile.meta_group_rank() * tile_size * num_targets))
+                 : (working_memory + (str_idx * tile_size * num_targets));
+
+  // initialize result: set true if target is empty, false otherwise
+  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+    auto const d_target = d_targets.element<string_view>(target_idx);
+    if constexpr (tile_size == 1) {
+      d_results[target_idx][str_idx] = d_target.empty();
+    } else {
+      auto const begin = bools + (target_idx * tile_size);
+      thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
+    }
+  }
+  tile.sync();
+
+  auto const last_ptr = d_first_bytes + unique_count;
+  for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
+       str_byte_idx += tile_size) {
+    // search for byte in first_bytes array
+    auto const sptr     = d_str.data() + str_byte_idx;
+    auto const chr      = static_cast<u_char>(*sptr);
+    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
+    // if not found, continue to next byte
+    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
+    // compute index of matched byte
+    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx          = d_offsets[offset_idx];
+    auto const last_idx   = (offset_idx + 1) < unique_count ?
d_offsets[offset_idx + 1] : num_targets; + // check for targets that begin with chr + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * tile_size) + lane_idx; + auto const found = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before + auto const d_target = d_targets.element(target_idx); + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } + } + } + } + } + } + + if constexpr (tile_size > 1) { + tile.sync(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative_group any() implementation was almost 3x slower than this parallel reduce + } + } +} +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort them + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), + cuda::proclaim_return_type([] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + })); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto num_items = targets.size(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + } + + // remove duplicates to help speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto const end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto const unique_count = + static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results = std::vector>(results_iter, results_iter + targets.size()); + auto d_results = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + + if (row_parallel) { + // Smaller strings perform better with a row per string + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_kernel<1> + <<>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + nullptr, + d_results); + } else { + constexpr cudf::thread_index_type tile_size = 
cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); + } + + return std::make_unique
(std::move(results)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. 
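+ *
+ * For example (an illustrative sketch): with width=5 the string "minhash"
+ * produces the substrings "minha", "inhas", and "nhash", one per starting
+ * character, and each is hashed with the seed. Positions that begin on a
+ * UTF-8 continuation byte, or that cannot fit a full width (other than the
+ * first position), store 0, which later processing treats as a skip sentinel.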
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } + } +} + +/** + * @brief Permutation calculation kernel + * + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. + * The output is the number of input rows (N) by the number of parameter values (M). 
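+ *
+ * For intuition, a sketch of the datatrove-style permutation used in the body:
+ * each (parameter_a[i], parameter_b[i]) pair defines one hash permutation, and
+ * for a substring hash hv the permuted value is
+ *   ((hv * parameter_a[i] + parameter_b[i]) % ((1UL << 61) - 1)) & hash_max
+ * The result for a row and parameter i is the minimum of this value over all
+ * of the row's substring hashes.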
+ * Each output row contains one result per (parameter_a, parameter_b) pair.
+ *
+ * This kernel is launched with one block per string for strings below
+ * wide_string_threshold, and with blocks_per_string blocks per string for
+ * strings at or above wide_string_threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculated values
+ */
+template <typename hash_value_type, int blocks_per_string>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
+
+  auto const offsets     = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  // number of items to process in this block;
+  // the last block also includes any remainder values from the
+  // size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for a string with size 588090 and blocks_per_string=64 is 9188,
+  //  except the last section which is 9188 + (588090 % 64) = 9246
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  // hash values for this block/section
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
+  auto const hashes_size =
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
+
+  auto const init = size_bytes == 0 ?
0 : std::numeric_limits::max(); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); + + auto const begin = seed_hashes + lane_idx; + auto const end = seed_hashes + hashes_size; + + // constants used in the permutation calculations + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + // found to be an efficient shared memory size for both hash types + __shared__ hash_value_type block_values[block_size * params_per_thread]; + + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); + block.sync(); + + auto const param_count = + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); + + // each lane accumulates min hashes in its shared memory + for (auto itr = begin; itr < end; itr += block_size) { + auto const hv = *itr; + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } + + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { + // permutation formula used by datatrove + hash_value_type const v = + ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); + } + } + block.sync(); + + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; + // each thread reduces a block_size of parameter values (thread per parameter) + if (lane_idx < param_count) { + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } + } + block.sync(); + } +} + +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto results = + 
cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + minhash_seed_kernel + <<>>(*d_strings, + seed, + width, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + return results; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -309,6 +647,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -333,6 +685,20 @@ std::unique_ptr 
minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -374,6 +740,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -394,6 +772,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e9ba58ba224..cbca0ceef77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,7 +83,6 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() - enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## @@ -611,6 +610,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp @@ -712,6 +712,7 @@ ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index c8c2d18903f..0fbd7da7f4d 100644 --- 
a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -34,7 +34,9 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input = R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index b58ca56e066..26937c9298a 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -239,7 +239,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest(), scale}}) + .dtypes(std::vector{data_type{type_to_id(), scale}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -324,7 +324,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -348,7 +348,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) + .dtypes(std::map{ + {"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); 
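  // A hedged aside on the change above: these tests now spell out the container
  // because builder::dtypes() accepts several forms (a positional std::vector,
  // a keyed std::map, and schema-based overloads), so a bare braced-init-list
  // is presumably ambiguous among them; e.g.
  //   .dtypes(std::vector<cudf::data_type>{dtype<double>()})
  // selects the positional-types overload explicitly.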
cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -466,7 +467,7 @@ TEST_P(JsonReaderParamTest, Booleans) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -508,7 +509,7 @@ TEST_P(JsonReaderParamTest, Dates) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .dtypes(std::vector{data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -564,7 +565,7 @@ TEST_P(JsonReaderParamTest, Durations) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) + .dtypes(std::vector{data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1022,7 +1023,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1461,7 +1462,7 @@ TEST_F(JsonReaderTest, ErrorStrings) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{cudf::type_id::STRING}}) + .dtypes(std::vector{data_type{cudf::type_id::STRING}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -1849,7 +1850,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{type_to_id(), 0}}) + .dtypes(std::vector{data_type{type_to_id(), 0}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -2827,7 +2828,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2865,7 +2866,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2884,9 +2885,9 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) std::map data_types; std::map child_types; child_types.insert( - std::pair{"element", 
cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); - data_types.insert(std::pair{ - "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING}, {}}}); + data_types.insert( + std::pair{"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}, child_types}}); cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) @@ -2991,4 +2992,264 @@ TEST_F(JsonReaderTest, LastRecordInvalid) CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); } +// Test case for dtype pruning with column order +TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema with partial ordering + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"0", "1"}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"b", "a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + EXPECT_EQ(result.metadata.schema_info[1].name, "a"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// schema with pruned columns and different order. 
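+      //// a minimal sketch of the mechanism under test (names illustrative):
+      ////   cudf::io::schema_element s{data_type{cudf::type_id::STRUCT},
+      ////                              {{"x", {dtype<int32_t>()}}, {"y", {dtype<bool>()}}},
+      ////                              {{"y", "x"}}};
+      ////   in_options.set_dtypes(s);  // with prune_columns, output order is y, x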
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"c", {dtype<bool>()}},
+                                              {"b",
+                                               {
+                                                 data_type{cudf::type_id::STRUCT},
+                                               }},
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"c", "b", "a"}}};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // "c", "b" and "a" order
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "c");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "a");
+      // pruned
+      EXPECT_EQ(result.metadata.schema_info[1].children.size(), 0);
+    }
+    //// schema with pruned columns and different sub-order.
+    {
+      cudf::io::schema_element dtype_schema{
+        data_type{cudf::type_id::STRUCT},
+        {
+          {"c", {dtype<bool>()}},
+          {"b",
+           {data_type{cudf::type_id::STRUCT},
+            // {},
+            {{"0", {data_type{cudf::type_id::STRING}}},
+             {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}},
+            {{"1", "0"}}}},
+          {"a", {dtype<int32_t>()}},
+        }};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Order of occurrence in JSON
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+      // Sub-order of "b"
+      EXPECT_EQ(result.metadata.schema_info[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "1");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "0");
+    }
+    //// schema with 1 dtype, but 2 column order
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"a", "b"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Input schema column order size mismatch with input schema child types
+    }
+    //// repeated name in column order, Error
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"a", "a"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Input schema column order size mismatch with input schema child types
+    }
+    //// different column name in order, Error
+    {
+      cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                            {
+                                              {"a", {dtype<int32_t>()}},
+                                            },
+                                            {{"b"}}};
+      EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument);
+      // Column name not found in input schema map, but present in column order and
+      // prune_columns is enabled
+    }
+    // include only one column (nested)
+    {
+      cudf::io::schema_element dtype_schema{
+        data_type{cudf::type_id::STRUCT},
+        {
+          {"b",
+           {data_type{cudf::type_id::STRUCT},
+            {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}},
+            {{"1"}}}},
+        }};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "b":"1":[float]
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"d", {dtype()}}, + }, + {{"a", "d"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_bools = + cudf::test::fixed_width_column_wrapper{{true, true, true, true}, {0, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_bools); + } + // test struct, list of string, list of struct. + // multiple - not all present nested + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }, + {{"2"}}}}, + {"d", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"e", + {data_type{cudf::type_id::LIST}, + {{"element", + { + data_type{cudf::type_id::STRUCT}, + { + {"3", {data_type{cudf::type_id::STRING}}}, + }, //{{"3"}} missing column_order, but output should not have it. 
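+              // (hedged reading: because this "element" struct omits a
+              // column_order while prune_columns is on, its "3" child is
+              // dropped from the output schema, as the zero-children
+              // assertion below verifies)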
+           }}}}},
+        },
+        {{"b", "d", "e"}}};
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "b" (with only its "2" child), "d", and "e"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].name, "2");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "d");
+      auto all_null_strings = cudf::test::strings_column_wrapper{{"", "", "", ""}, {0, 0, 0, 0}};
+      EXPECT_EQ(result.tbl->get_column(0).num_children(), 1);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), all_null_strings);
+      auto const all_null_list = cudf::test::lists_column_wrapper{
+        {{0, 0}, {1, 1}, {2, 2}, {3, 3}}, cudf::test::iterators::all_nulls()};
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_list);
+      EXPECT_EQ(result.metadata.schema_info[2].name, "e");
+      ASSERT_EQ(result.metadata.schema_info[2].children.size(), 2);
+      ASSERT_EQ(result.metadata.schema_info[2].children[1].children.size(), 0);
+      // ASSERT_EQ(result.metadata.schema_info[2].children[1].children[0].name, "3");
+      auto empty_string_col = cudf::test::strings_column_wrapper{};
+      cudf::test::structs_column_wrapper expected_structs{{}, cudf::test::iterators::all_nulls()};
+      // make an all-null column of list<struct<string>>
+      auto wrapped = make_lists_column(
+        4,
+        cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0}.release(),
+        expected_structs.release(),
+        4,
+        cudf::create_null_mask(4, cudf::mask_state::ALL_NULL));
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped);
+    }
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index f47782a2d02..39cd783de00 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -18,6 +18,7 @@
 #include
+#include
 #include
 #include
 #include
@@ -69,3 +70,76 @@ TEST_F(ParquetStringsTest, ReadLargeStrings)
   // go back to normal threshold
   unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }
+
+// Disabled as the test is too brittle and depends on the empirically set `pass_read_limit`,
+// encoding type, and the currently used `ZSTD` scratch space size.
+TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings)
+{
+  // Construct a table with one large strings column > 2GB
+  auto const wide = this->wide_column();
+  auto input      = cudf::concatenate(std::vector<cudf::column_view>(120000, wide));  ///< 230MB
+
+  int constexpr multiplier = 12;
+  std::vector<cudf::column_view> input_cols(multiplier, input->view());
+  auto col0 = cudf::concatenate(input_cols);  ///< 2.70GB
+
+  // Expected table
+  auto const expected    = cudf::table_view{{col0->view()}};
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+
+  // Needed to get exactly 2 Parquet subpasses: first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  expected_metadata.column_metadata[0].set_encoding(
+    cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY);
+
+  // Host buffer to write Parquet
+  std::vector<char> buffer;
+
+  // Writer options
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected)
+      .metadata(expected_metadata);
+
+  // Needed to get exactly 2 Parquet subpasses: first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  out_opts.set_compression(cudf::io::compression_type::ZSTD);
+
+  // Write to Parquet
+  cudf::io::write_parquet(out_opts);
+
+  // Empirically set pass_read_limit of 8GB so we read almost the entire table (>2GB strings) in
+  // the first subpass and only a small amount in the second subpass. This may change in the
+  // future and lead to false failures.
+  size_t constexpr pass_read_limit = size_t{8} * 1024 * 1024 * 1024;
+
+  // Reader options
+  cudf::io::parquet_reader_options default_in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size()));
+
+  // Chunked parquet reader
+  auto reader = cudf::io::chunked_parquet_reader(0, pass_read_limit, default_in_opts);
+
+  // Read chunked
+  auto tables = std::vector<std::unique_ptr<cudf::table>>{};
+  while (reader.has_next()) {
+    tables.emplace_back(reader.read_chunk().tbl);
+  }
+  auto table_views = std::vector<cudf::table_view>{};
+  std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) {
+    return tbl->view();
+  });
+  auto result            = cudf::concatenate(table_views);
+  auto const result_view = result->view();
+
+  // Verify offsets
+  for (auto const& cv : result_view) {
+    auto const offsets = cudf::strings_column_view(cv).offsets();
+    EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64});
+  }
+
+  // Verify tables to be equal
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected);
+
+  // Verify that we read exactly two table chunks
+  EXPECT_EQ(tables.size(), 2);
+}
diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp
new file mode 100644
index 00000000000..4f4f16a9e70
--- /dev/null
+++ b/cpp/tests/streams/quantile_test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct QuantileTest : public cudf::test::BaseFixture {}; + +TEST_F(QuantileTest, TestMultiColumnUnsorted) +{ + auto input_a = cudf::test::strings_column_wrapper( + {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", + "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, + {true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + cudf::test::fixed_width_column_wrapper input_b( + {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto input = cudf::table_view({input_a, input_b}); + + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {}, + cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, TestEmpty) +{ + auto input = cudf::test::fixed_width_column_wrapper({}); + cudf::quantile( + input, {0.5, 0.25}, cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, EmptyInput) +{ + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input, cudf::test::get_default_stream()); + + cudf::tdigest::tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); +} + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. 
t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + 
"quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,14 @@ #include #include #include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index ef35a4472cf..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -28,155 +28,169 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = 
thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = 
thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 
200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on 
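+  // As in the 32-bit case above, the 31 (a, b) parameter pairs exceed
+  // params_per_thread, so these expected values cover more than one parameter batch.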
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git a/dependencies.yaml b/dependencies.yaml index 4c6aefe996f..b5165f82d5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -442,7 +442,7 @@ 
dependencies:
   common:
     - output_types: [conda]
       packages:
-          - pyarrow>=14.0.0,<18.0.0a0
+          - pyarrow>=14.0.0,<19.0.0a0
     - output_types: [requirements, pyproject]
       packages:
           # pyarrow 17.0.0 wheels have a subtle issue around threading that
@@ -450,8 +450,8 @@ dependencies:
           # be highly dependent on the exact build configuration, so we'll just
           # avoid 17.0.0 for now unless we observe similar issues in future
           # releases as well.
-          - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'
-          - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'
+          - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'
+          - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'
   cuda_version:
     specific:
       - output_types: conda
@@ -587,6 +587,12 @@ dependencies:
       packages:
         - clang==19.1.0
        - clang-tools==19.1.0
+        # TODO: These are build requirements for IWYU and can be replaced
+        # with IWYU itself once a conda package of IWYU supporting clang 19
+        # is available.
+        - clangdev==19.1.0
+        - llvm==19.1.0
+        - llvmdev==19.1.0
   docs:
     common:
       - output_types: [conda]
@@ -669,7 +675,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cachetools
-          - &numba-cuda-dep numba-cuda>=0.0.13
+          - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18
           - nvtx>=0.2.1
           - packaging
           - rich
@@ -728,7 +734,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.13
+          - polars>=1.11,<1.14
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 5942cc16850..fbb9ca4b128 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
 import tempfile
 import warnings
 import xml.etree.ElementTree as ET
+from enum import IntEnum
+from typing import Any
+
+import cudf
 from docutils.nodes import Text
 from packaging.version import Version
-from sphinx.addnodes import pending_xref
-from sphinx.highlighting import lexers
-from sphinx.ext import intersphinx
 from pygments.lexer import RegexLexer
 from pygments.token import Text as PText
-
-import cudf
+from sphinx.addnodes import pending_xref
+from sphinx.ext import intersphinx
+from sphinx.ext.autodoc import ClassDocumenter, bool_option
+from sphinx.highlighting import lexers


 class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
-    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
+    "DeviceBuffer": (
+        "rmm.pylibrmm.device_buffer.DeviceBuffer",
+        "rmm.DeviceBuffer",
+    ),
 }
@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
 _all_namespaces = _generate_namespaces(
     {
         # Note that io::datasource is actually a nested class
-        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
+        "cudf": {
+            "io",
+            "io::datasource",
+            "strings",
+            "ast",
+            "ast::expression",
+            "io::text",
+        },
         "numeric": {},
         "nvtext": {},
     }
@@ -554,6 +566,8 @@ def on_missing_reference(app, env, node, contnode):

 nitpick_ignore = [
+    # Erroneously warned in ParquetColumnSchema.name
+    ("py:class", "unicode"),
     ("py:class", "SeriesOrIndex"),
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
@@ -640,9 +654,54 @@ def linkcode_resolve(domain, info) -> str | None:
         f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
     )

+# Needed to avoid a build warning from the PandasCompat extension
suppress_warnings = ["myst.domains"]


+class PLCIntEnumDocumenter(ClassDocumenter):
+    objtype = "enum"
+    directivetype = "attribute"
+    priority = 10 + ClassDocumenter.priority
+
+    option_spec = dict(ClassDocumenter.option_spec)
+
+    @classmethod
+    def can_document_member(
+        cls, member: Any, membername: str, isattr: bool, parent: Any
+    ) -> bool:
+        try:
+            return issubclass(
+                member, IntEnum
+            ) and member.__module__.startswith("pylibcudf")
+        except TypeError:
+            return False
+
+    def add_directive_header(self, sig: str) -> None:
+        self.directivetype = "attribute"
+        super().add_directive_header(sig)
+
+    def add_content(self, more_content) -> None:
+        doc_as_attr = self.doc_as_attr
+        self.doc_as_attr = False
+        super().add_content(more_content)
+        self.doc_as_attr = doc_as_attr
+        source_name = self.get_sourcename()
+        enum_object: IntEnum = self.object
+
+        if self.object.__name__ != "Kind":
+            self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name)
+        self.add_line("", source_name)
+        self.add_line("Enum members", source_name)
+        self.add_line("", source_name)
+
+        for the_member_name in enum_object.__members__:  # type: ignore[attr-defined]
+            self.add_line(
+                f"* ``{the_member_name}``", source_name
+            )
+        self.add_line("", source_name)
+
+
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_js_file(
@@ -650,3 +709,5 @@ def setup(app):
     )
     app.connect("doctree-read", resolve_aliases)
     app.connect("missing-reference", on_missing_reference)
+    app.setup_extension("sphinx.ext.autodoc")
+    app.add_autodocumenter(PLCIntEnumDocumenter)
diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
index 911a64fa152..b653b786129 100644
--- a/docs/cudf/source/developer_guide/cudf_pandas.md
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -11,7 +11,8 @@ In the rest of this document, to maintain a concrete pair of libraries in mind,
 For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library).

 ```{note}
-We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type.
+1. We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type.
+2. A `custom_iter` method is defined to always use the slow object's `iter` method. This avoids moving the object to the GPU (where iteration would raise an error), only to move it back to the CPU to execute the iteration successfully.
 ```

 ### Types:
diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 39840e72e21..1ee828e7c4e 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip
 - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
 - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
 - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.
-
+- Type stubs are provided and generated manually. When adding new
+  functionality, ensure that the matching type stub is appropriately updated.

 ## Relationship to libcudf

@@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with
 and set arguments not shared between overloads to `None`. If a user tries to pass in
 an unsupported argument for a specific overload type, you should raise `ValueError`.

 Finally, consider filing a libcudf issue if you think this inconsistency can be addressed on the libcudf side.
+
+### Type stubs
+
+Since static type checkers like `mypy` and `pyright` cannot parse
+Cython code, we provide type stubs for the pylibcudf package. These
+are currently maintained manually, alongside the matching pylibcudf
+files.
+
+Every `pyx` file should have a matching `pyi` file that provides the
+type stubs. Most functions can be exposed straightforwardly. Some
+guiding principles:
+
+- For typed integer arguments in libcudf, use `int` as a type
+  annotation.
+- For functions which are annotated as a `list` in Cython, but the
+  function body does more detailed checking, try to encode the
+  detailed information in the type.
+- For Cython fused types there are two options:
+  1. If the fused type appears only once in the function signature,
+     use a `Union` type;
+  2. If the fused type appears more than once (or as both an input
+     and output type), use a `TypeVar` with
+     the variants in the fused type provided as constraints.
+
+
+As an example, `pylibcudf.copying.split` is typed in Cython as:
+
+```cython
+ctypedef fused ColumnOrTable:
+    Table
+    Column
+
+cpdef list split(ColumnOrTable input, list splits): ...
+```
+
+Here we only have a single use of the fused type, and the `list`
+arguments do not specify their values. If we provide a `Column`
+as input, we receive a `list[Column]` as output, and if we provide a
+`Table` we receive `list[Table]` as output.
+
+In the type stub, we can encode this with a `TypeVar`; we can also
+provide typing for the `splits` argument that indicates that the split
+values must be integers:
+
+```python
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+```
+
+Conversely, `pylibcudf.copying.scatter` uses a fused type only once in
+its input:
+
+```cython
+ctypedef fused TableOrListOfScalars:
+    Table
+    list
+
+cpdef Table scatter(
+    TableOrListOfScalars source, Column scatter_map, Table target
+)
+```
+
+In the type stub, we can use a normal union in this case:
+
+```python
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target: Table
+) -> Table: ...
+```
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 53638f071cc..1c1c8040972 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,4 +19,6 @@ I/O Functions
    csv
    json
    parquet
+   parquet_metadata
+   text
    timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
new file mode 100644
index 00000000000..fce964f9714
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
@@ -0,0 +1,6 @@
+================
+Parquet Metadata
+================
+
+..
automodule:: pylibcudf.io.parquet_metadata + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst new file mode 100644 index 00000000000..327ca043f36 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst @@ -0,0 +1,6 @@ +==== +text +==== + +.. automodule:: pylibcudf.io.text + :members: diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 4b5379cf0f1..b85c215d7d1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -65,7 +65,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ -DBUILD_SHARED_LIBS=OFF \ - -DKvikIO_REMOTE_SUPPORT=OFF + -DCUDF_KVIKIO_REMOTE_IO=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java index e48b1cf59e4..86b6b98f2ae 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ * that is backing it. */ public class DeviceMemoryBufferView extends BaseDeviceMemoryBuffer { - DeviceMemoryBufferView(long address, long lengthInBytes) { + public DeviceMemoryBufferView(long address, long lengthInBytes) { // Set the cleaner to null so we don't end up releasing anything super(address, lengthInBytes, (MemoryBufferCleaner) null); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..b01ce31b1f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,7 +259,6 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter) throws CudfException; @@ -275,7 +274,6 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter, long dsHandle) throws CudfException; @@ -1092,224 +1090,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. 
- if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } - } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return 
DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1318,10 +1098,6 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. 
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1336,11 +1112,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1361,6 +1136,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1370,6 +1149,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator, int emptyRowCount) { @@ -1381,14 +1161,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, @@ -1470,6 +1250,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1478,6 +1262,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @param emptyRowCount the number of rows to use if no columns were found. * @return the data parsed as a table on the GPU. 
*/ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len, int emptyRowCount) { if (len <= 0) { @@ -1486,10 +1271,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1505,10 +1286,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1525,18 +1305,19 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1550,11 +1331,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0a667978ca3..1f8b1ea207d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1037,21 +1037,23 @@ cudf::io::schema_element read_schema_element(int& index, if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { std::map child_elems; int num_children = children[index]; + std::vector child_names(num_children); // go to the next entry, so recursion can parse it. 
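  // (after the increment, `index` refers to this element's first child)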
index++; for (int i = 0; i < num_children; i++) { - auto const name = std::string{names.get(index).get()}; + auto name = std::string{names.get(index).get()}; child_elems.insert( std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)}); + child_names[i] = std::move(name); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return cudf::io::schema_element{d_type, std::move(child_elems), {std::move(child_names)}}; } else { if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... index++; - return cudf::io::schema_element{d_type, {}}; + return cudf::io::schema_element{d_type, {}, std::nullopt}; } } @@ -1824,7 +1826,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter, jlong ds_handle) @@ -1853,6 +1854,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1864,7 +1866,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1886,13 +1887,19 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, } std::map data_types; + std::vector name_order; int at = 0; while (at < n_types.size()) { auto const name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.push_back(name); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); + } else { // should infer the types } @@ -1925,7 +1932,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter) { @@ -1968,6 +1974,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1979,7 +1986,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -2001,13 +2007,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, } std::map data_types; + std::vector name_order; + name_order.reserve(n_types.size()); int at = 0; while (at < n_types.size()) { - auto const name = std::string{n_col_names.get(at).get()}; + auto name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.emplace_back(std::move(name)); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); } else { // should infer the types } diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index d844466120f..7e8f29dac93 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -4,46 +4,28 @@ import warnings from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.datetime import DatetimeComponent +from pylibcudf.datetime import DatetimeComponent, RoundingFrequency from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() def add_months(Column col, Column months): # months must be int16 dtype - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef column_view months_view = months.view() - - with nogil: - c_result = move( - libcudf_datetime.add_calendrical_months( - col_view, - months_view - ) + return Column.from_pylibcudf( + plc.datetime.add_calendrical_months( + col.to_pylibcudf(mode="read"), + months.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.datetime_component component - component_names = { "year": DatetimeComponent.YEAR, "month": DatetimeComponent.MONTH, @@ -57,33 +39,29 @@ def extract_datetime_component(Column col, object field): "nanosecond": DatetimeComponent.NANOSECOND, } if field == "day_of_year": - with nogil: - c_result = move(libcudf_datetime.day_of_year(col_view)) + result = Column.from_pylibcudf( + plc.datetime.day_of_year( + col.to_pylibcudf(mode="read") + ) + ) elif field in component_names: - component = component_names[field] - with nogil: - c_result = move( - 
libcudf_datetime.extract_datetime_component(
-                    col_view,
-                    component
-                )
+        result = Column.from_pylibcudf(
+            plc.datetime.extract_datetime_component(
+                col.to_pylibcudf(mode="read"),
+                component_names[field],
             )
+        )
+        if field == "weekday":
+            # Pandas counts Monday-Sunday as 0-6
+            # while libcudf counts Monday-Sunday as 1-7
+            result = result - result.dtype.type(1)
     else:
         raise ValueError(f"Invalid field: '{field}'")

-    result = Column.from_unique_ptr(move(c_result))
-
-    if field == "weekday":
-        # Pandas counts Monday-Sunday as 0-6
-        # while libcudf counts Monday-Sunday as 1-7
-        result = result - result.dtype.type(1)
-
     return result


 cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
-    cdef libcudf_datetime.rounding_frequency freq_val
-
     # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html
     old_to_new_freq_map = {
         "H": "h",
@@ -101,96 +79,75 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
             FutureWarning
         )
         freq = old_to_new_freq_map.get(freq)
-    if freq == "D":
-        freq_val = libcudf_datetime.rounding_frequency.DAY
-    elif freq == "h":
-        freq_val = libcudf_datetime.rounding_frequency.HOUR
-    elif freq == "min":
-        freq_val = libcudf_datetime.rounding_frequency.MINUTE
-    elif freq == "s":
-        freq_val = libcudf_datetime.rounding_frequency.SECOND
-    elif freq == "ms":
-        freq_val = libcudf_datetime.rounding_frequency.MILLISECOND
-    elif freq == "us":
-        freq_val = libcudf_datetime.rounding_frequency.MICROSECOND
-    elif freq == "ns":
-        freq_val = libcudf_datetime.rounding_frequency.NANOSECOND
+    rounding_frequency_map = {
+        "D": RoundingFrequency.DAY,
+        "h": RoundingFrequency.HOUR,
+        "min": RoundingFrequency.MINUTE,
+        "s": RoundingFrequency.SECOND,
+        "ms": RoundingFrequency.MILLISECOND,
+        "us": RoundingFrequency.MICROSECOND,
+        "ns": RoundingFrequency.NANOSECOND,
+    }
+    if freq in rounding_frequency_map:
+        return rounding_frequency_map[freq]
     else:
         raise ValueError(f"Invalid resolution: '{freq}'")
-    return freq_val


 @acquire_spill_lock()
 def ceil_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.ceil_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def floor_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.floor_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def round_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.round_datetimes(
col.to_pylibcudf(mode="read"), + _get_rounding_frequency(freq), + ) + ) @acquire_spill_lock() def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.is_leap_year(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.is_leap_year( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def date_range(DeviceScalar start, size_type n, offset): - cdef unique_ptr[column] c_result cdef size_type months = ( offset.kwds.get("years", 0) * 12 + offset.kwds.get("months", 0) ) - - cdef const scalar* c_start = start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( + return Column.from_pylibcudf( + plc.filling.calendrical_month_sequence( n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) + start.c_value, + months, + ) + ) @acquire_spill_lock() @@ -199,34 +156,28 @@ def extract_quarter(Column col): Returns a column which contains the corresponding quarter of the year for every timestamp inside the input column. """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.extract_quarter(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.extract_quarter( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def days_in_month(Column col): """Extracts the number of days in the month of the date """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.days_in_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.days_in_month( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def last_day_of_month(Column col): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.last_day_of_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.last_day_of_month( + col.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c199ed96d4f..1ce6dfab15e 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,7 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.replace cimport replace_policy from pylibcudf.libcudf.scalar.scalar cimport scalar import pylibcudf @@ -244,13 +243,11 @@ cdef class GroupBy: return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. 
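# Note (hedged sketch): the enum the removed TODO refers to is now exposed
# as pylibcudf.replace.ReplacePolicy, used just below. PRECEDING fills each
# null from the last valid value within the group ('ffill'); anything else
# maps to FOLLOWING, which back-fills from the next valid value ('bfill').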
_, replaced = self._groupby.replace_nulls( pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING + pylibcudf.replace.ReplacePolicy.PRECEDING + if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING ] * len(values), ) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 76a6e32fde0..96504ebdd66 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 564daefbae2..f23980b387a 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,76 +7,20 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( column_name_info, - host_buffer, sink_info, - source_info, ) from cudf._lib.column cimport Column import codecs -import errno import io import os from cudf.core.dtypes import StructDtype - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. - # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - # Converts the Python sink input to libcudf IO sink_info. 
cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & sink diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index fb149603960..7dc9cd01a00 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,7 +104,7 @@ cpdef read_json(object filepaths_or_buffers, ) df = cudf.DataFrame._from_data( *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], + columns=[Column.from_pylibcudf(col) for col in res_cols], column_names=res_col_names, index_names=None ) diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 3966cce8981..524bfd3b2e8 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive, plc_column = plc.labeling.label_bins( input.to_pylibcudf(mode="read"), left_edges.to_pylibcudf(mode="read"), - left_inclusive, + plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO, right_edges.to_pylibcudf(mode="read"), - right_inclusive + plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO, ) return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 12432ac6d5d..9a2aa4a6130 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( plc.lists.distinct( col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL, + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL, ) ) @@ -48,7 +50,7 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - ascending, + order.ASCENDING if ascending else order.DESCENDING, null_order.BEFORE if na_position == "first" else null_order.AFTER, False, ) @@ -91,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key): plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -102,7 +104,7 @@ def index_of_column(Column col, Column search_keys): plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -123,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), - dropna, + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, ) ) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5e39cafa47b..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
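# Sketch of what the new permuted minhash bindings below compute (formula
# taken from the docstrings added in column/string.py further down): for
# each row and each parameter pair (a[i], b[i]),
#     min over substring hashes hv of (hv * a[i] + b[i]) % (2**61 - 1)
# A single integer seed replaces the old per-hash seeds column, which is
# why uint32_t/uint64_t are cimported here.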
+from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( @@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): result = nvtext.minhash.word_minhash( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1212637d330..d4bd0cd306c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -27,7 +27,6 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector @@ -41,12 +40,7 @@ from pylibcudf.libcudf.io.parquet cimport ( parquet_writer_options, write_parquet as parquet_writer, ) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) from pylibcudf.libcudf.io.types cimport ( - source_info, sink_info, column_in_metadata, table_input_metadata, @@ -62,7 +56,6 @@ from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, make_sinks_info, - make_source_info, ) from cudf._lib.utils cimport table_view_from_table @@ -373,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, nrows=nrows, skip_rows=skip_rows) return df -cpdef read_parquet_metadata(filepaths_or_buffers): +cpdef read_parquet_metadata(list filepaths_or_buffers): """ Cython function to call into libcudf API, see `read_parquet_metadata`. 
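A minimal standalone sketch of the pylibcudf pattern the rewritten body in the next hunk follows (the file name here is only illustrative; the accessor names are the ones used below):

    import pylibcudf as plc

    meta = plc.io.parquet_metadata.read_parquet_metadata(
        plc.io.SourceInfo(["example.parquet"])
    )
    # The wrapper derives its return values from these accessors.
    num_rows, num_rowgroups = meta.num_rows(), meta.num_rowgroups()
    col_names = [info.name() for info in meta.schema().root().children()]
    row_groups = meta.rowgroup_metadata()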
@@ -382,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers):
     cudf.io.parquet.read_parquet
     cudf.io.parquet.to_parquet
     """
-    cdef source_info source = make_source_info(filepaths_or_buffers)
-
-    args = move(source)
-
-    cdef parquet_metadata c_result
-
-    # Read Parquet metadata
-    with nogil:
-        c_result = move(parquet_metadata_reader(args))
-
-    # access and return results
-    num_rows = c_result.num_rows()
-    num_rowgroups = c_result.num_rowgroups()
-
-    # extract row group metadata and sanitize keys
-    row_group_metadata = [{k.decode(): v for k, v in metadata}
-                          for metadata in c_result.rowgroup_metadata()]
+    parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+        plc.io.SourceInfo(filepaths_or_buffers)
+    )

     # read all column names including index column, if any
-    col_names = [info.name().decode() for info in c_result.schema().root().children()]
-
-    # access the Parquet file_footer to find the index
-    index_col = None
-    cdef unordered_map[string, string] file_footer = c_result.metadata()
+    col_names = [info.name() for info in parquet_metadata.schema().root().children()]

-    # get index column name(s)
-    index_col_names = None
-    json_str = file_footer[b'pandas'].decode('utf-8')
-    meta = None
+    index_col_names = set()
+    json_str = parquet_metadata.metadata()['pandas']
     if json_str != "":
         meta = json.loads(json_str)
         file_is_range_index, index_col, _ = _parse_metadata(meta)

-        if not file_is_range_index and index_col is not None \
-                and index_col_names is None:
-            index_col_names = {}
+        if (
+            not file_is_range_index
+            and index_col is not None
+        ):
+            columns = meta['columns']
             for idx_col in index_col:
-                for c in meta['columns']:
+                for c in columns:
                     if c['field_name'] == idx_col:
-                        index_col_names[idx_col] = c['name']
+                        index_col_names.add(idx_col)

     # remove the index column from the list of column names
-    # only if index_col_names is not None
-    if index_col_names is not None:
+    # only if any index column names were found
+    if len(index_col_names) > 0:
         col_names = [name for name in col_names if name not in index_col_names]

-    # num_columns = length of list(col_names)
-    num_columns = len(col_names)
-
-    # return the metadata
-    return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata
+    return (
+        parquet_metadata.num_rows(),
+        parquet_metadata.num_rowgroups(),
+        col_names,
+        len(col_names),
+        parquet_metadata.rowgroup_metadata()
+    )


 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index 7666b7ff8da..509cfe5e9f8 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -6,14 +6,6 @@
 from libcpp cimport bool
 from libcpp.vector cimport vector

 from cudf._lib.column cimport Column
-from cudf._lib.types cimport (
-    underlying_type_t_interpolation,
-    underlying_type_t_sorted,
-)
-
-from cudf._lib.types import Interpolation
-
-from pylibcudf.libcudf.types cimport interpolation, sorted

 from cudf._lib.utils cimport columns_from_pylibcudf_table

@@ -28,17 +20,13 @@ def quantile(
     Column ordered_indices,
     bool exact,
 ):
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> Interpolation[interp.upper()]
-    )
-
     return Column.from_pylibcudf(
         plc.quantiles.quantile(
             input.to_pylibcudf(mode="read"),
             q,
-            c_interp,
+            plc.types.Interpolation[interp.upper()],
             ordered_indices.to_pylibcudf(mode="read"),
-            exact
+            exact
         )
     )

@@ -51,22 +39,14 @@ def quantile_table(
     list column_order,
     list null_precedence,
 ):
-
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> interp
-    )
-    cdef sorted c_is_input_sorted = <sorted>(
-        <underlying_type_t_sorted> is_input_sorted
-    )
-
     return columns_from_pylibcudf_table(
plc.quantiles.quantiles( plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]), q, - c_interp, - c_is_input_sorted, + interp, + is_input_sorted, column_order, null_precedence ) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 185552ede82..eefe37d9880 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -5,21 +5,10 @@ from itertools import repeat from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.libcudf.aggregation cimport rank_method -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.search cimport lower_bound, upper_bound -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport null_order, order as cpp_order - from cudf._lib.column cimport Column -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table import pylibcudf @@ -311,44 +300,19 @@ def digitize(list source_columns, list bins, bool right=False): right : Indicating whether the intervals include the right or the left bin edge. """ - - cdef table_view bins_view = table_view_from_columns(bins) - cdef table_view source_table_view = table_view_from_columns( - source_columns - ) - cdef vector[cpp_order] column_order = ( - vector[cpp_order]( - bins_view.num_columns(), - cpp_order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - bins_view.num_columns(), - null_order.BEFORE + return Column.from_pylibcudf( + getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in bins] + ), + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ), + [pylibcudf.types.Order.ASCENDING]*len(bins), + [pylibcudf.types.NullOrder.BEFORE]*len(bins) ) ) - cdef unique_ptr[column] c_result - if right: - with nogil: - c_result = move(lower_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - else: - with nogil: - c_result = move(upper_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def rank_columns(list source_columns, rank_method method, str na_option, diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ffa5e603408..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx index 8b6da2bfa1c..50113347ccb 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -1,15 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - is_integer as cpp_is_integer, -) +import pylibcudf as plc from cudf._lib.column cimport Column @@ -20,12 +13,8 @@ def is_integer(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have integers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_integers.is_integer( + source_strings.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index b2c7232f549..7942d067c2b 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,33 +1,20 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from io import TextIOBase +from libcpp cimport bool -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move +from io import TextIOBase -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) +import pylibcudf as plc from cudf._lib.column cimport Column def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): + str delimiter, + object byte_range, + bool strip_delimiters, + object compression, + object compression_offsets): """ Cython function to call into libcudf API, see `multibyte_split`. 
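A minimal standalone sketch of the pylibcudf text API the hunk below switches to, splitting an in-memory buffer on newlines (all three calls appear in the new body; ParseOptions is assumed to default byte_range to the whole source):

    import pylibcudf as plc

    source = plc.io.text.make_source("a\nb\nc\n")
    options = plc.io.text.ParseOptions(strip_delimiters=True)
    # one output row per delimited chunk
    plc_column = plc.io.text.multibyte_split(source, "\n", options)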
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) + datasource = plc.io.text.make_source(filepaths_or_buffers.read()) else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) elif compression == "bgzip": if isinstance(filepaths_or_buffers, TextIOBase): raise ValueError("bgzip compression requires a file path") @@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers, if len(compression_offsets) != 2: raise ValueError( "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + compression_offsets[0], + compression_offsets[1] + ) else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + ) else: raise ValueError("Only bgzip compression is supported at the moment") - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters + ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 4fd3d31841e..c2b760490c1 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id -ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 861bb063707..f169ea12b10 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - 
underlying_type_t_sorted, -) - import pylibcudf import cudf @@ -151,44 +145,6 @@ datetime_unit_map = { size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - cdef dtype_from_lists_column_view(column_view cv): # lists_column_view have no default constructor, so we heap # allocate it to get around Cython's limitation of requiring diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 856ce0f75de..3d70b01b7e4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5350,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. @@ -5390,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. 
+ + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 205edd91d9d..2b4a17f9559 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar @@ -789,15 +791,13 @@ def _quantile_table( column_order=(), null_precedence=(), ): - interpolation = libcudf.types.Interpolation[interpolation] + interpolation = plc.types.Interpolation[interpolation] - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] + is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"] - column_order = [libcudf.types.Order[key] for key in column_order] + column_order = [plc.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [plc.types.NullOrder[key] for key in null_precedence] return self._from_columns_like_self( libcudf.quantiles.quantile_table( diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index df7bbe22a61..e206c8bca08 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -351,6 +351,22 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "kvikio_remote_io", + _env_get_bool("CUDF_KVIKIO_REMOTE_IO", False), + textwrap.dedent( + """ + Whether to use KvikIO's remote IO backend or not. + \tWARN: this is experimental and may be removed at any time + \twithout warning or deprecation period. + \tSet KVIKIO_NTHREADS (default is 8) to change the number of + \tconcurrent tcp connections, which is important for good performance. + \tValid values are True or False. Default is False. 
+ """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 66a51a83896..b801654068e 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -52,4 +52,13 @@ def array_interface(self: _FastSlowProxy): def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) + """ + Custom iter method to handle the case where only the slow + object's iter method is used. + """ + # NOTE: Do not remove this method. This is required to avoid + # falling back to GPU for iter method. + return _maybe_wrap_result( + iter(self._fsproxy_slow), + None, # type: ignore + ) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 73afde407db..9768a6c4a2f 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -33,6 +33,20 @@ def call_operator(fn, args, kwargs): "EXECUTE_SLOW": 0x0571B0, } +# This is a dict of functions that are known to have arguments that +# need to be transformed from fast to slow only. i.e., Some cudf functions +# error on passing a device object but don't error on passing a host object. +# For example: DataFrame.__setitem__(arg, value) errors on passing a +# cudf.Index object but doesn't error on passing a pd.Index object. +# Hence we need to transform the arg from fast to slow only. So, we use +# a dictionary like: +# {"DataFrame.__setitem__": {0}} +# where the keys are the function names and the values are the indices +# (0-based) of the arguments that need to be transformed. + +_SPECIAL_FUNCTIONS_ARGS_MAP = { + "DataFrame.__setitem__": {0}, +} _WRAPPER_ASSIGNMENTS = tuple( attr @@ -875,6 +889,10 @@ def __name__(self, value): pass setattr(self._fsproxy_slow, "__name__", value) + @property + def _customqualname(self): + return self._fsproxy_slow.__qualname__ + def _assert_fast_slow_eq(left, right): if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: @@ -1011,7 +1029,36 @@ def _transform_arg( # use __reduce_ex__ instead... if type(arg) is tuple: # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) + if ( + len(arg) > 0 + and isinstance(arg[0], _MethodProxy) + and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP + ): + indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[ + arg[0]._customqualname + ] + method_proxy, original_args, original_kwargs = arg + + original_args = tuple( + _transform_arg(a, "_fsproxy_slow", seen) + if i - 1 in indices_map + else _transform_arg(a, attribute_name, seen) + for i, a in enumerate(original_args) + ) + original_kwargs = _transform_arg( + original_kwargs, attribute_name, seen + ) + return tuple( + ( + _transform_arg(method_proxy, attribute_name, seen), + original_args, + original_kwargs, + ) + ) + else: + return tuple( + _transform_arg(a, attribute_name, seen) for a in arg + ) elif hasattr(arg, "__getnewargs_ex__"): # Partial implementation of to reconstruct with # transformed pieces @@ -1099,7 +1146,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: """ Wraps "result" in a fast-slow proxy if is a "proxiable" object. 
""" - if _is_final_type(result): + if isinstance(result, (int, str, float, bool, type(None))): + return result + elif _is_final_type(result): typ = get_final_type_map()[type(result)] return typ._fsproxy_wrap(result, func) elif _is_intermediate_type(result): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c9ce24d2a5b..96512dacb69 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -193,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf): return fname -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): test_pdf = pd.DataFrame( [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], @@ -405,14 +400,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): assert_eq(expect, got) -def test_parquet_read_metadata(tmpdir, pdf): +def test_parquet_read_metadata(tmp_path, pdf): if len(pdf) > 100: pytest.skip("Skipping long setup test") def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) - fname = tmpdir.join("metadata.parquet") + fname = tmp_path / "metadata.parquet" row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) @@ -431,7 +426,7 @@ def num_row_groups(rows, group_size): assert a == b -def test_parquet_read_filtered(tmpdir, rdg_seed): +def test_parquet_read_filtered(tmpdir): # Generate data fname = tmpdir.join("filtered.parquet") dg.generate( @@ -455,13 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng(seed=None).integers( + lambda: np.random.default_rng(seed=0).integers( 0, 100, size=40 ), True, ), ], - seed=rdg_seed, + seed=42, ), format={"name": "parquet", "row_group_size": 64}, ) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 0958b68084d..afb82f75bcf 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -69,6 +69,7 @@ def s3_base(endpoint_ip, endpoint_port): # with an S3 endpoint on localhost endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" + os.environ["AWS_ENDPOINT_URL"] = endpoint_uri server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) server.start() @@ -105,6 +106,15 @@ def s3_context(s3_base, bucket, files=None): pass +@pytest.fixture( + params=[True, False], + ids=["kvikio=ON", "kvikio=OFF"], +) +def kvikio_remote_io(request): + with cudf.option_context("kvikio_remote_io", request.param): + yield request.param + + @pytest.fixture def pdf(scope="module"): df = pd.DataFrame() @@ -193,6 +203,7 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): def test_read_parquet( s3_base, s3so, + kvikio_remote_io, pdf, bytes_per_thread, columns, diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - 
cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d636f36f282..aecb7ae7c5c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -16,6 +16,7 @@ import pandas as pd from fsspec.core import expand_paths_if_needed, get_fs_token_paths +import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -1624,6 +1625,16 @@ def _maybe_expand_directories(paths, glob_pattern, fs): return expanded_paths +def _use_kvikio_remote_io(fs) -> bool: + """Whether `kvikio_remote_io` is enabled and `fs` refers to a S3 file""" + + try: + from s3fs.core import S3FileSystem + except ImportError: + return False + return cudf.get_option("kvikio_remote_io") and isinstance(fs, S3FileSystem) + + @doc_get_reader_filepath_or_buffer() def get_reader_filepath_or_buffer( path_or_data, @@ -1649,17 +1660,17 @@ def get_reader_filepath_or_buffer( ) ] if not input_sources: - raise ValueError("Empty input source list: {input_sources}.") + raise ValueError(f"Empty input source list: {input_sources}.") filepaths_or_buffers = [] string_paths = [isinstance(source, str) for source in input_sources] if any(string_paths): - # Sources are all strings. Thes strings are typically + # Sources are all strings. 
The strings are typically
        # file paths, but they may also be raw text strings.

        # Don't allow a mix of source types
        if not all(string_paths):
-            raise ValueError("Invalid input source list: {input_sources}.")
+            raise ValueError(f"Invalid input source list: {input_sources}.")

        # Make sure we define a filesystem (if possible)
        paths = input_sources
@@ -1712,11 +1723,17 @@
                 raise FileNotFoundError(
                     f"{input_sources} could not be resolved to any files"
                 )
-            filepaths_or_buffers = _prefetch_remote_buffers(
-                paths,
-                fs,
-                **(prefetch_options or {}),
-            )
+
+            # If `kvikio_remote_io` is enabled and `fs` refers to an S3 file,
+            # we create S3 URLs and let them pass through to libcudf.
+            if _use_kvikio_remote_io(fs):
+                filepaths_or_buffers = [f"s3://{fpath}" for fpath in paths]
+            else:
+                filepaths_or_buffers = _prefetch_remote_buffers(
+                    paths,
+                    fs,
+                    **(prefetch_options or {}),
+                )
     else:
         raw_text_input = True

diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 3e7d1cf3c4c..d48fbad0ec3 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -12,6 +12,7 @@
 import pickle
 import subprocess
 import tempfile
+import time
 import types
 from io import BytesIO, StringIO

@@ -1777,3 +1778,43 @@ def test_cudf_pandas_util_version(attrs):
         assert not hasattr(pd.util, attrs)
     else:
         assert hasattr(pd.util, attrs)
+
+
+def test_iteration_over_dataframe_dtypes_produces_proxy_objects(dataframe):
+    _, xdf = dataframe
+    xdf["b"] = xpd.IntervalIndex.from_arrays(xdf["a"], xdf["b"])
+    xdf["a"] = xpd.Series([1, 1, 1, 2, 3], dtype="category")
+    dtype_series = xdf.dtypes
+    assert all(is_proxy_object(x) for x in dtype_series)
+    assert isinstance(dtype_series.iloc[0], xpd.CategoricalDtype)
+    assert isinstance(dtype_series.iloc[1], xpd.IntervalDtype)
+
+
+def test_iter_doesnot_raise(monkeypatch):
+    s = xpd.Series([1, 2, 3])
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True")
+        for _ in s:
+            pass
+
+
+def test_dataframe_setitem_slowdown():
+    # We are explicitly testing the slowdown of the setitem operation
+    df = xpd.DataFrame(
+        {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000}
+    ).astype("float64")
+    df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)})
+    new_df = df + 1
+    start_time = time.time()
+    df[df.columns] = new_df
+    end_time = time.time()
+    delta = int(end_time - start_time)
+    if delta > 5:
+        pytest.fail(f"Test took too long to run, runtime: {delta}")
+
+
+def test_dataframe_setitem():
+    df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64")
+    new_df = df + 1
+    df[df.columns] = new_df
+    tm.assert_equal(df, new_df)

diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 1eadceaaccd..280dd52bb22 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -24,14 +24,14 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "libcudf==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
    "numpy>=1.23,<3.0a0",
    "nvtx>=0.2.1",
    "packaging",
    "pandas>=2.0,<2.2.4dev0",
    "ptxcompiler",
-    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
-    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
+    "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'",
    "pylibcudf==24.12.*,>=0.0.0a0",
    "rich",
    "rmm==24.12.*,>=0.0.0a0",
@@ -83,6 +83,14
@@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index ec0bc0eb22b..b2ea3f06e48 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -47,6 +47,14 @@ rapids = ["rmm", "cudf", "dask_cudf"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 66c15f694ee..ba4858c5619 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -12,7 +12,7 @@ from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator # Check we have a supported polars version from cudf_polars.utils.versions import _ensure_polars_version @@ -22,7 +22,7 @@ __all__: list[str] = [ "execute_with_cudf", - "translate_ir", + "Translator", "__git_commit__", "__version__", ] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 76816ee0a61..d085f21e0ad 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -18,7 +18,7 @@ import rmm from rmm._cuda import gpu -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: from collections.abc import Generator @@ -148,12 +148,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf( - nt: NodeTraverser, - *, - config: GPUEngine, - exception: type[Exception] | tuple[type[Exception], ...] = Exception, -) -> None: +def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -165,10 +160,15 @@ def execute_with_cudf( config GPUEngine configuration object - exception - Optional exception, or tuple of exceptions, to catch during - translation. Defaults to ``Exception``. + Raises + ------ + ValueError + If the config contains unsupported keys. + NotImplementedError + If translation of the plan is unsupported. + Notes + ----- The NodeTraverser is mutated if the libcudf executor can handle the plan. 
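A hedged usage sketch (assuming a polars installation with GPU support; the callback itself is registered by polars when a query is collected with the GPU engine, not called directly):

    import polars as pl

    q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
    # Dispatches through execute_with_cudf; on unsupported plans the
    # engine warns (POLARS_VERBOSE) or raises (raise_on_fail).
    result = q.collect(engine="gpu")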
""" device = config.device @@ -178,22 +178,27 @@ def execute_with_cudf( raise ValueError( f"Engine configuration contains unsupported settings {unsupported}" ) - try: - with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf( - partial( - _callback, - translate_ir(nt), - device=device, - memory_resource=memory_resource, - ) + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. + # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + formatted_errors = "\n".join( + f"- {e.__class__.__name__}: {e}" for e in unique_errors ) - except exception as e: - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn( - f"Query execution with GPU not supported, reason: {type(e)}: {e}", - PerformanceWarning, - stacklevel=2, + error_message = ( + "Query execution with GPU not possible: unsupported operations." + f"\nThe errors were:\n{formatted_errors}" + ) + exception = NotImplementedError(error_message, unique_errors) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(error_message, PerformanceWarning, stacklevel=2) + if raise_on_fail: + raise exception + else: + nt.set_udf( + partial(_callback, ir, device=device, memory_resource=memory_resource) ) - if raise_on_fail: - raise diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 08bc9d0ea3f..7560a0f5a64 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame: # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} - table: pa.Table = plc.interop.to_arrow( + table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e748ec16f14..326d6b65cbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -19,6 +19,8 @@ from cudf_polars.dsl.expressions.base import ( AggInfo, Col, + ColRef, + ErrorExpr, Expr, NamedExpr, ) @@ -35,11 +37,13 @@ __all__ = [ "Expr", + "ErrorExpr", "NamedExpr", "Literal", "LiteralColumn", "Len", "Col", + "ColRef", "BooleanFunction", "StringFunction", "TemporalFunction", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index effe8cb2378..23851f91938 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"] +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] class AggInfo(NamedTuple): @@ -155,6 +155,17 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) # pragma: no cover; check_agg trips first +class ErrorExpr(Expr): + __slots__ = ("error",) + _non_child = ("dtype", "error") + error: str + + def __init__(self, dtype: plc.DataType, error: str) -> None: + self.dtype = dtype + self.error = error + self.children = () + + class NamedExpr: # NamedExpr does not inherit from Expr since it does not appear # when evaluating expressions themselves, only when constructing @@ -249,3 +260,36 @@ def do_evaluate( def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" return AggInfo([(self, plc.aggregation.collect_list(), self)]) + + +class ColRef(Expr): + __slots__ = ("index", "table_ref") + _non_child = ("dtype", "index", "table_ref") + index: int + table_ref: plc.expressions.TableReference + + def __init__( + self, + dtype: plc.DataType, + index: int, + table_ref: plc.expressions.TableReference, + column: Expr, + ) -> None: + if not isinstance(column, Col): + raise TypeError("Column reference should only apply to columns") + self.dtype = dtype + self.index = index + self.table_ref = table_ref + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError( + "Only expect this node as part of an expression translated to libcudf AST." 
+ ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 65fa4bfa62f..cd8e5c6a4eb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -27,7 +27,9 @@ class TemporalFunction(Expr): __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + _COMPONENT_MAP: ClassVar[ + dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] + ] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index c16313bf83c..7eba0c110ab 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Array[Any, Any] + value: pa.Array[Any] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: self.dtype = dtype diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a242ff9300f..98e8a83b04e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,8 +29,9 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame from cudf_polars.dsl.nodebase import Node -from cudf_polars.dsl.to_ast import to_parquet_filter +from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes +from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, MutableMapping, Sequence @@ -41,6 +42,7 @@ __all__ = [ "IR", + "ErrorNode", "PythonScan", "Scan", "Cache", @@ -48,6 +50,7 @@ "Select", "GroupBy", "Join", + "ConditionalJoin", "HStack", "Distinct", "Sort", @@ -210,6 +213,23 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) +class ErrorNode(IR): + """Represents an error translating the IR.""" + + __slots__ = ("error",) + _non_child = ( + "schema", + "error", + ) + error: str + """The error.""" + + def __init__(self, schema: Schema, error: str): + self.schema = schema + self.error = error + self.children = () + + class PythonScan(IR): """Representation of input from a python function.""" @@ -498,7 +518,7 @@ def do_evaluate( # Mask must have been applied. return df elif typ == "ndjson": - json_schema: list[tuple[str, str, list]] = [ + json_schema: list[plc.io.json.NameAndType] = [ (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( @@ -522,6 +542,12 @@ def do_evaluate( ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index + if POLARS_VERSION_GT_112: + # If we sliced away some data from the start, that + # shifts the row index. 
+ # But prior to 1.13, polars had this wrong, so we match behaviour + # https://github.com/pola-rs/polars/issues/19607 + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -890,6 +916,66 @@ def do_evaluate( return DataFrame(broadcasted).slice(options.slice) +class ConditionalJoin(IR): + """A conditional inner join of two dataframes on a predicate.""" + + __slots__ = ("predicate", "options", "ast_predicate") + _non_child = ("schema", "predicate", "options") + predicate: expr.Expr + options: tuple + + def __init__( + self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR + ) -> None: + self.schema = schema + self.predicate = predicate + self.options = options + self.children = (left, right) + self.ast_predicate = to_ast(predicate) + _, join_nulls, zlice, suffix, coalesce = self.options + # Preconditions from polars + assert not join_nulls + assert not coalesce + if self.ast_predicate is None: + raise NotImplementedError( + f"Conditional join with predicate {predicate}" + ) # pragma: no cover; polars never delivers expressions we can't handle + self._non_child_args = (self.ast_predicate, zlice, suffix) + + @classmethod + def do_evaluate( + cls, + predicate: plc.expressions.Expression, + zlice: tuple[int, int] | None, + suffix: str, + left: DataFrame, + right: DataFrame, + ) -> DataFrame: + """Evaluate and return a dataframe.""" + lg, rg = plc.join.conditional_inner_join(left.table, right.table, predicate) + left = DataFrame.from_table( + plc.copying.gather( + left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + left.column_names, + ) + right = DataFrame.from_table( + plc.copying.gather( + right.table, rg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + right.column_names, + ) + right = right.rename_columns( + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } + ) + result = left.with_columns(right.columns) + return result.slice(zlice) + + class Join(IR): """A join of two dataframes.""" @@ -1464,7 +1550,7 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" - ) + ) # pragma: no cover self.options = ( tuple(indices), tuple(pivotees), diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py index 228d300f467..dd5c40a00be 100644 --- a/python/cudf_polars/cudf_polars/dsl/nodebase.py +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -43,9 +43,7 @@ class Node(Generic[T]): def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: return (*(getattr(self, attr) for attr in self._non_child), *children) - def reconstruct( - self, children: Sequence[T] - ) -> Self: # pragma: no cover; not yet used + def reconstruct(self, children: Sequence[T]) -> Self: """ Rebuild this node with new children. 
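The "not yet used" pragma on `reconstruct` is dropped because the expression rewrites in to_ast.py below now exercise it. A hedged sketch of the intended pattern (`transform` is a hypothetical child-rewriting pass, not part of the patch):

    def transform(node):
        new_children = tuple(transform(c) for c in node.children)
        # reconstruct swaps in new children while keeping every
        # _non_child attribute (dtype, options, ...) unchanged
        return node.reconstruct(new_children)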
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py index 9a0838631cc..acc4b3669af 100644 --- a/python/cudf_polars/cudf_polars/dsl/to_ast.py +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -14,12 +14,14 @@ from pylibcudf import expressions as plc_expr from cudf_polars.dsl import expr -from cudf_polars.dsl.traversal import CachingVisitor +from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged from cudf_polars.typing import GenericTransformer if TYPE_CHECKING: from collections.abc import Mapping + from cudf_polars.typing import ExprTransformer + # Can't merge these op-mapping dictionaries because scoped enum values # are exposed by cython with equality/hash based one their underlying # representation type. So in a dict they are just treated as integers. @@ -128,7 +130,14 @@ def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression: def _(node: expr.Col, self: Transformer) -> plc_expr.Expression: if self.state["for_parquet"]: return plc_expr.ColumnNameReference(node.name) - return plc_expr.ColumnReference(self.state["name_to_index"][node.name]) + raise TypeError("Should always be wrapped in a ColRef node before translation") + + +@_to_ast.register +def _(node: expr.ColRef, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + raise TypeError("Not expecting ColRef node in parquet filter") + return plc_expr.ColumnReference(node.index, node.table_ref) @_to_ast.register @@ -238,9 +247,7 @@ def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None: return None -def to_ast( - node: expr.Expr, *, name_to_index: Mapping[str, int] -) -> plc_expr.Expression | None: +def to_ast(node: expr.Expr) -> plc_expr.Expression | None: """ Convert an expression to libcudf AST nodes suitable for compute_column. @@ -248,18 +255,66 @@ def to_ast( ---------- node Expression to convert. - name_to_index - Mapping from column names to their index in the table that - will be used for expression evaluation. + + Notes + ----- + `Col` nodes must always be wrapped in `TableRef` nodes when + converting to an ast expression so that their table reference and + index are provided. Returns ------- - pylibcudf Expressoin if conversion is possible, otherwise None. + pylibcudf Expression if conversion is possible, otherwise None. """ - mapper = CachingVisitor( - _to_ast, state={"for_parquet": False, "name_to_index": name_to_index} - ) + mapper = CachingVisitor(_to_ast, state={"for_parquet": False}) try: return mapper(node) except (KeyError, NotImplementedError): return None + + +def _insert_colrefs(node: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(node, expr.Col): + return expr.ColRef( + node.dtype, + rec.state["name_to_index"][node.name], + rec.state["table_ref"], + node, + ) + return reuse_if_unchanged(node, rec) + + +def insert_colrefs( + node: expr.Expr, + *, + table_ref: plc.expressions.TableReference, + name_to_index: Mapping[str, int], +) -> expr.Expr: + """ + Insert column references into an expression before conversion to libcudf AST. + + Parameters + ---------- + node + Expression to insert references into. + table_ref + pylibcudf `TableReference` indicating whether column + references are coming from the left or right table. + name_to_index: + Mapping from column names to column indices in the table + eventually used for evaluation. 
+ + Notes + ----- + All column references are wrapped in the same, singular, table + reference, so this function relies on the expression only + containing column references from a single table. + + Returns + ------- + New expression with column references inserted. + """ + mapper = CachingVisitor( + _insert_colrefs, state={"table_ref": table_ref, "name_to_index": name_to_index} + ) + return mapper(node) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5181214819e..e8ed009cdf2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -21,14 +21,127 @@ import pylibcudf as plc from cudf_polars.dsl import expr, ir -from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged +from cudf_polars.dsl.to_ast import insert_colrefs from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: - from cudf_polars.typing import ExprTransformer + from cudf_polars.typing import NodeTraverser -__all__ = ["translate_ir", "translate_named_expr"] +__all__ = ["Translator", "translate_named_expr"] + + +class Translator: + """ + Translates polars-internal IR nodes and expressions to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + """ + + def __init__(self, visitor: NodeTraverser): + self.visitor = visitor + self.errors: list[Exception] = [] + + def translate_ir(self, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. + + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError + If the version of Polars IR is unsupported. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorNode` nodes and collected in the the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + ctx: AbstractContextManager[None] = ( + set_node(self.visitor, n) if n is not None else noop_context + ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + if (version := self.visitor.version()) >= (4, 0): + e = NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + self.errors.append(e) # pragma: no cover + raise e # pragma: no cover + + with ctx: + polars_schema = self.visitor.get_schema() + try: + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + except Exception as e: + self.errors.append(NotImplementedError(str(e))) + return ir.ErrorNode({}, str(e)) + try: + node = self.visitor.view_current_node() + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + try: + result = _translate_ir(node, self, schema) + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + error = NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + self.errors.append(error) + return ir.ErrorNode(schema, str(error)) + + return result + + def translate_expr(self, *, n: int) -> expr.Expr: + """ + Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + n + Node to translate, an integer referencing a polars internal node. + + Returns + ------- + Translated IR object. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorExpr` nodes and collected in the the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + node = self.visitor.view_expression(n) + dtype = dtypes.from_polars(self.visitor.get_dtype(n)) + try: + return _translate_expr(node, self, dtype) + except Exception as e: + self.errors.append(e) + return expr.ErrorExpr(dtype, str(e)) class set_node(AbstractContextManager[None]): @@ -70,7 +183,7 @@ def __exit__(self, *args: Any) -> None: @singledispatch def _translate_ir( - node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: Any, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -79,19 +192,19 @@ def _translate_ir( @_translate_ir.register def _( - node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( - translate_named_expr(visitor, n=predicate) if predicate is not None else None + translate_named_expr(translator, n=predicate) if predicate is not None else None ) return ir.PythonScan(schema, options, predicate) @_translate_ir.register def _( - node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type if typ == "ndjson": @@ -120,7 +233,7 @@ def _( skip_rows, n_rows, row_index, - translate_named_expr(visitor, n=node.predicate) + translate_named_expr(translator, n=node.predicate) if node.predicate is not None else None, ) @@ -128,20 +241,20 @@ def _( @_translate_ir.register def _( - node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + return ir.Cache(schema, node.id_, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, node.df, node.projection, - translate_named_expr(visitor, n=node.selection) + translate_named_expr(translator, n=node.selection) if node.selection is not None else None, ) @@ -149,22 +262,22 @@ def _( @_translate_ir.register def _( - node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = 
[translate_named_expr(translator, n=e) for e in node.expr] return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] - keys = [translate_named_expr(visitor, n=e) for e in node.keys] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + aggs = [translate_named_expr(translator, n=e) for e in node.aggs] + keys = [translate_named_expr(translator, n=e) for e in node.keys] return ir.GroupBy( schema, keys, @@ -177,17 +290,17 @@ def _( @_translate_ir.register def _( - node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] + with set_node(translator.visitor, node.input_left): + inp_left = translator.translate_ir(n=None) + left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + with set_node(translator.visitor, node.input_right): + inp_right = translator.translate_ir(n=None) + right_on = [translate_named_expr(translator, n=e) for e in node.right_on] if (how := node.options[0]) in { "inner", "left", @@ -204,80 +317,65 @@ def _( raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed - # No exposure of mixed/conditional joins in pylibcudf yet, so in - # the first instance, implement by doing a cross join followed by - # a filter. - _, join_nulls, zlice, suffix, coalesce = node.options - cross = ir.Join( - schema, - [], - [], - ("cross", join_nulls, None, suffix, coalesce), - inp_left, - inp_right, - ) - dtype = plc.DataType(plc.TypeId.BOOL8) if op2 is None: ops = [op1] else: ops = [op1, op2] - suffix = cross.options[3] - - # Column references in the right table refer to the post-join - # names, so with suffixes. 
- def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: - if isinstance(e, expr.Col) and e.name in inp_left.schema: - return type(e)(e.dtype, f"{e.name}{suffix}") - return reuse_if_unchanged(e, rec) - - mapper = make_recursive(_rename) - right_on = [ - expr.NamedExpr( - f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new - ) - for new, old in zip( - (mapper(e.value) for e in right_on), right_on, strict=True - ) - ] - mask = functools.reduce( + + dtype = plc.DataType(plc.TypeId.BOOL8) + predicate = functools.reduce( functools.partial( expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND ), ( - expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + expr.BinOp( + dtype, + expr.BinOp._MAPPING[op], + insert_colrefs( + left.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index={ + name: i for i, name in enumerate(inp_left.schema) + }, + ), + insert_colrefs( + right.value, + table_ref=plc.expressions.TableReference.RIGHT, + name_to_index={ + name: i for i, name in enumerate(inp_right.schema) + }, + ), + ) for op, left, right in zip(ops, left_on, right_on, strict=True) ), ) - filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) - if zlice is not None: - offset, length = zlice - return ir.Slice(schema, offset, length, filtered) - return filtered + + return ir.ConditionalJoin(schema, predicate, node.options, inp_left, inp_right) @_translate_ir.register def _( - node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.exprs] return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( - node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: (keep, subset, maintain_order, zlice) = node.options keep = ir.Distinct._KEEP_MAP[keep] @@ -288,17 +386,17 @@ def _( subset, zlice, maintain_order, - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_named_expr(visitor, n=e) for e in node.by_column] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + by = [translate_named_expr(translator, n=e) for e in node.by_column] stable, nulls_last, descending = node.sort_options 
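For orientation, a minimal sketch (not part of the patch) of the pylibcudf-level operation the new `ConditionalJoin` node bottoms out in: column references are resolved to `(index, table)` pairs, as `insert_colrefs` does above, and the resulting AST predicate drives `conditional_inner_join`. The tables and predicate here are hypothetical.

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.expressions import (
    ASTOperator,
    ColumnReference,
    Operation,
    TableReference,
)

left = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))
right = plc.interop.from_arrow(pa.table({"b": [2, 3, 4]}))

# Predicate left.a < right.b, with each column referenced by its index
# in the table it comes from (cf. ColRef/insert_colrefs above).
predicate = Operation(
    ASTOperator.LESS,
    ColumnReference(0, TableReference.LEFT),
    ColumnReference(0, TableReference.RIGHT),
)
# Returns a pair of gather maps selecting the matching row indices.
lg, rg = plc.join.conditional_inner_join(left, right, predicate)
```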
     order, null_order = sorting.sort_order(
         descending, nulls_last=nulls_last, num_keys=len(by)
@@ -308,33 +406,35 @@
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input))
+    return ir.Slice(
+        schema, node.offset, node.len, translator.translate_ir(n=node.input)
+    )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        mask = translate_named_expr(visitor, n=node.predicate)
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        mask = translate_named_expr(translator, n=node.predicate)
     return ir.Filter(schema, mask, inp)
 
 
 @_translate_ir.register
 def _(
     node: pl_ir.SimpleProjection,
-    visitor: NodeTraverser,
+    translator: Translator,
     schema: dict[str, plc.DataType],
 ) -> ir.IR:
-    return ir.Projection(schema, translate_ir(visitor, n=node.input))
+    return ir.Projection(schema, translator.translate_ir(n=node.input))
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     name, *options = node.function
     return ir.MapFunction(
@@ -342,83 +442,36 @@
         name,
         options,
         # TODO: merge_sorted breaks this pattern
-        translate_ir(visitor, n=node.input),
+        translator.translate_ir(n=node.input),
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.Union(
-        schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs)
+        schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs)
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs))
-
-
-def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
-    """
-    Translate a polars-internal IR node to our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Optional node to start traversing from, if not provided uses
-        current polars-internal node.
-
-    Returns
-    -------
-    Translated IR object
-
-    Raises
-    ------
-    NotImplementedError
-        If we can't translate the nodes due to unsupported functionality.
-    """
-    ctx: AbstractContextManager[None] = (
-        set_node(visitor, n) if n is not None else noop_context
-    )
-    # IR is versioned with major.minor, minor is bumped for backwards
-    # compatible changes (e.g. adding new nodes), major is bumped for
-    # incompatible changes (e.g. renaming nodes).
-    if (version := visitor.version()) >= (4, 0):
-        raise NotImplementedError(
-            f"No support for polars IR {version=}"
-        )  # pragma: no cover; no such version for now.
-
-    with ctx:
-        polars_schema = visitor.get_schema()
-        node = visitor.view_current_node()
-        schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()}
-        result = _translate_ir(node, visitor, schema)
-        if any(
-            isinstance(dtype, pl.Null)
-            for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values())
-        ):
-            raise NotImplementedError(
-                f"No GPU support for {result} with Null column dtype."
-            )
-        return result
+    return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs))
 
 
 def translate_named_expr(
-    visitor: NodeTraverser, *, n: pl_expr.PyExprIR
+    translator: Translator, *, n: pl_expr.PyExprIR
 ) -> expr.NamedExpr:
     """
     Translate a polars-internal named expression IR object into our representation.
 
     Parameters
     ----------
-    visitor
-        Polars NodeTraverser object
+    translator
+        Translator object
     n
         Node to translate, a named expression node.
 
@@ -438,12 +491,12 @@
     NotImplementedError
         If any translation fails due to unsupported functionality.
     """
-    return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+    return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node))
 
 
 @singledispatch
 def _translate_expr(
-    node: Any, visitor: NodeTraverser, dtype: plc.DataType
+    node: Any, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     raise NotImplementedError(
         f"Translation for {type(node).__name__}"
@@ -451,7 +504,7 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     name, *options = node.function_data
     options = tuple(options)
     if isinstance(name, pl_expr.StringFunction):
@@ -460,7 +513,7 @@
             pl_expr.StringFunction.StripCharsStart,
             pl_expr.StringFunction.StripCharsEnd,
         }:
-            column, chars = (translate_expr(visitor, n=n) for n in node.input)
+            column, chars = (translator.translate_expr(n=n) for n in node.input)
             if isinstance(chars, expr.Literal):
                 if chars.value == pa.scalar(""):
                     # No-op in polars, but libcudf uses empty string
@@ -477,11 +530,11 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
         if name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            column, lo, hi = (translator.translate_expr(n=n) for n in node.input)
             (closed,) = options
             lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
             return expr.BinOp(
@@ -494,7 +547,7 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.TemporalFunction):
         # functions for which evaluation of the expression may not return
@@ -514,14 +567,14 @@
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
         if name in needs_cast:
             return expr.Cast(dtype, result_expr)
         return result_expr
 
     elif isinstance(name, str):
-        children = (translate_expr(visitor, n=n) for n in node.input)
+        children = (translator.translate_expr(n=n) for n in node.input)
         if name == "log":
             (base,) = options
             (child,) = children
@@ -540,26 +593,26 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby?
     if isinstance(node.options, pl_expr.RollingGroupOptions):
         # pl.col("a").rolling(...)
         return expr.RollingWindow(
-            dtype, node.options, translate_expr(visitor, n=node.function)
+            dtype, node.options, translator.translate_expr(n=node.function)
         )
     elif isinstance(node.options, pl_expr.WindowMapping):
         # pl.col("a").over(...)
         return expr.GroupedRollingWindow(
             dtype,
             node.options,
-            translate_expr(visitor, n=node.function),
-            *(translate_expr(visitor, n=n) for n in node.partition_by),
+            translator.translate_expr(n=node.function),
+            *(translator.translate_expr(n=n) for n in node.partition_by),
         )
     assert_never(node.options)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     if isinstance(node.value, plrs.PySeries):
         return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value))
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
@@ -567,42 +620,42 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby
-    return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr))
+    return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr))
 
 
 @_translate_expr.register
-def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.SortBy(
         dtype,
         node.sort_options,
-        translate_expr(visitor, n=node.expr),
-        *(translate_expr(visitor, n=n) for n in node.by),
+        translator.translate_expr(n=node.expr),
+        *(translator.translate_expr(n=n) for n in node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Gather(
         dtype,
-        translate_expr(visitor, n=node.expr),
-        translate_expr(visitor, n=node.idx),
+        translator.translate_expr(n=node.expr),
+        translator.translate_expr(n=node.idx),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Filter(
         dtype,
-        translate_expr(visitor, n=node.input),
-        translate_expr(visitor, n=node.by),
+        translator.translate_expr(n=node.input),
+        translator.translate_expr(n=node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    inner = translate_expr(visitor, n=node.expr)
+def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr:
+    inner = translator.translate_expr(n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
@@ -614,17 +667,17 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Col(dtype, node.name)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Agg(
         dtype,
         node.name,
         node.options,
-        *(translate_expr(visitor, n=n) for n in node.arguments),
+        *(translator.translate_expr(n=n) for n in node.arguments),
     )
     if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
         return expr.Cast(value.dtype, value)
@@ -632,55 +685,30 @@
 
 @_translate_expr.register
-def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Ternary(
         dtype,
-        translate_expr(visitor, n=node.predicate),
-        translate_expr(visitor, n=node.truthy),
-        translate_expr(visitor, n=node.falsy),
+        translator.translate_expr(n=node.predicate),
+        translator.translate_expr(n=node.truthy),
+        translator.translate_expr(n=node.falsy),
     )
 
 
 @_translate_expr.register
 def _(
-    node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
+    node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     return expr.BinOp(
         dtype,
         expr.BinOp._MAPPING[node.op],
-        translate_expr(visitor, n=node.left),
-        translate_expr(visitor, n=node.right),
+        translator.translate_expr(n=node.left),
+        translator.translate_expr(n=node.right),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Len(dtype)
     if dtype.id() != plc.TypeId.INT32:
         return expr.Cast(dtype, value)
     return value  # pragma: no cover; never reached since polars len has uint32 dtype
-
-
-def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
-    """
-    Translate a polars-internal expression IR into our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Node to translate, an integer referencing a polars internal node.
-
-    Returns
-    -------
-    Translated IR object.
-
-    Raises
-    ------
-    NotImplementedError
-        If any translation fails due to unsupported functionality.
-    """
-    node = visitor.view_expression(n)
-    dtype = dtypes.from_polars(visitor.get_dtype(n))
-    return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 7b45c1eaa06..2207545aa60 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -10,7 +10,7 @@
 from polars import GPUEngine
 from polars.testing.asserts import assert_frame_equal
 
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
     import polars as pl
@@ -117,12 +117,14 @@
     AssertionError
         If the specified exceptions were not raised.
     """
-    try:
-        _ = translate_ir(q._ldf.visit())
-    except exceptions:
+    translator = Translator(q._ldf.visit())
+    translator.translate_ir()
+    if errors := translator.errors:
+        for err in errors:
+            assert any(
+                isinstance(err, err_type) for err_type in exceptions
+            ), f"Translation DID NOT RAISE {exceptions}"
         return
-    except Exception as e:
-        raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e
     else:
         raise AssertionError(f"Translation DID NOT RAISE {exceptions}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 2f95cd38c57..080a1af6e19 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -40,7 +40,7 @@ def pytest_configure(config: pytest.Config) -> None:
     )
     config.addinivalue_line(
         "filterwarnings",
-        "ignore:.*Query execution with GPU not supported",
+        "ignore:.*Query execution with GPU not possible",
     )
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index a90c283ee54..e7ac72df609 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -71,11 +71,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool:
     -------
     True if casting is supported, False otherwise
     """
+    has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY
     return (
         (
-            plc.traits.is_fixed_width(to)
-            and plc.traits.is_fixed_width(from_)
-            and plc.unary.is_supported_cast(from_, to)
+            from_ == to
+            or not has_empty
+            and (
+                plc.traits.is_fixed_width(to)
+                and plc.traits.is_fixed_width(from_)
+                and plc.unary.is_supported_cast(from_, to)
+            )
         )
         or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to))
         or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_))
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index a119cab3b74..b08cede8f7f 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -14,6 +14,8 @@
 POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11")
 POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12")
+POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12")
+POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13")
 
 
 def _ensure_polars_version():
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 17a94c633f8..2f2361223d2 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -458,12 +458,12 @@ translate it to our intermediate representation (IR), and then execute
 and convert back to polars:
 
 ```python
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 q = ...
 
 # Convert to our IR
-ir = translate_ir(q._ldf.visit())
+ir = Translator(q._ldf.visit()).translate_ir()
 
 # DataFrame living on the device
 result = ir.evaluate(cache={})
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 2e75dff5c9e..785e87391e7 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.13",
+    "polars>=1.11,<1.14",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -49,6 +49,14 @@ license-files = ["LICENSE"]
 [tool.setuptools.dynamic]
 version = {file = "cudf_polars/VERSION"}
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
index 57d794d4890..f6c24da0180 100644
--- a/python/cudf_polars/tests/dsl/test_to_ast.py
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import pyarrow as pa
 import pytest
 
 import polars as pl
@@ -10,10 +11,11 @@
 
 import pylibcudf as plc
 
+import cudf_polars.dsl.expr as expr_nodes
 import cudf_polars.dsl.ir as ir_nodes
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.containers.dataframe import DataFrame, NamedColumn
-from cudf_polars.dsl.to_ast import to_ast
+from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter
 
 
 @pytest.fixture(scope="module")
@@ -58,14 +60,21 @@ def df():
 )
 def test_compute_column(expr, df):
     q = df.select(expr)
-    ir = translate_ir(q._ldf.visit())
+    ir = Translator(q._ldf.visit()).translate_ir()
 
     assert isinstance(ir, ir_nodes.Select)
     table = ir.children[0].evaluate(cache={})
     name_to_index = {c.name: i for i, c in enumerate(table.columns)}
 
     def compute_column(e):
-        ast = to_ast(e.value, name_to_index=name_to_index)
+        e_with_colrefs = insert_colrefs(
+            e.value,
+            table_ref=plc.expressions.TableReference.LEFT,
+            name_to_index=name_to_index,
+        )
+        with pytest.raises(NotImplementedError):
+            e_with_colrefs.evaluate(table)
+        ast = to_ast(e_with_colrefs)
         if ast is not None:
             return NamedColumn(
                 plc.transform.compute_column(table.table, ast), name=e.name
@@ -77,3 +86,28 @@ def compute_column(e):
 
     expect = q.collect()
     assert_frame_equal(expect, got)
+
+
+def test_invalid_colref_construction_raises():
+    literal = expr_nodes.Literal(
+        plc.DataType(plc.TypeId.INT8), pa.scalar(1, type=pa.int8())
+    )
+    with pytest.raises(TypeError):
+        expr_nodes.ColRef(
+            literal.dtype, 0, plc.expressions.TableReference.LEFT, literal
+        )
+
+
+def test_to_ast_without_colref_raises():
+    col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a")
+
+    with pytest.raises(TypeError):
+        to_ast(col)
+
+
+def test_to_parquet_filter_with_colref_raises():
+    col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a")
+    colref = expr_nodes.ColRef(col.dtype, 0, plc.expressions.TableReference.LEFT, col)
+
+    with pytest.raises(TypeError):
+        to_parquet_filter(colref)
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 15c644d7978..8958c2a0f84 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.traversal import (
     CachingVisitor,
@@ -109,7 +109,7 @@ def test_rewrite_ir_node():
     df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]})
     q = df.group_by("a").agg(pl.col("b").sum()).sort("b")
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
 
     new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]})
 
@@ -150,7 +150,7 @@ def replace_scan(node, rec):
     mapper = CachingVisitor(replace_scan)
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
     new = mapper(orig)
 
     result = new.evaluate(cache={}).to_polars()
@@ -174,7 +174,7 @@ def test_rewrite_names_and_ops():
         .collect()
     )
 
-    qir = translate_ir(q._ldf.visit())
+    qir = Translator(q._ldf.visit()).translate_ir()
 
     @singledispatch
     def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr:
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 62df8ce1498..6170281ad54 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
@@ -68,7 +68,7 @@ def test_setsorted(descending, nulls_last, with_nulls):
 
     assert_gpu_result_equal(q)
 
-    df = translate_ir(q._ldf.visit()).evaluate(cache={})
+    df = Translator(q._ldf.visit()).translate_ir().evaluate(cache={})
 
     a = df.column_map["a"]
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
index 9900f598e5f..25b71716eed 100644
--- a/python/cudf_polars/tests/test_config.py
+++ b/python/cudf_polars/tests/test_config.py
@@ -30,7 +30,7 @@ def raise_unimplemented(self, *args):
         pytest.raises(pl.exceptions.ComputeError),
         pytest.warns(
             pl.exceptions.PerformanceWarning,
-            match="Query execution with GPU not supported",
+            match="Query execution with GPU not possible",
         ),
     ):
         # And ensure that collecting issues the correct warning.
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 8ca7a7b9264..2fcbbf21f1c 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -13,7 +13,7 @@
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
-from cudf_polars.utils.versions import POLARS_VERSION_LT_112
+from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113
 
 
 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
@@ -110,7 +110,11 @@ def test_cross_join(left, right, zlice):
 
 
 @pytest.mark.parametrize(
-    "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))]
+    "left_on,right_on",
+    [
+        (pl.col("a"), pl.lit(2, dtype=pl.Int64)),
+        (pl.lit(2, dtype=pl.Int64), pl.col("a")),
+    ],
 )
 def test_join_literal_key_unsupported(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
@@ -125,7 +129,13 @@
         [pl.col("a_right") <= pl.col("a") * 2],
         [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")],
         [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")],
-        [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+        pytest.param(
+            [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+            marks=pytest.mark.xfail(
+                POLARS_VERSION_LT_113,
+                reason="https://github.com/pola-rs/polars/issues/19597",
+            ),
+        ),
     ],
 )
 def test_join_where(left, right, conditions, zlice):
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
index e895f27f637..63aa1c573a9 100644
--- a/python/cudf_polars/tests/test_mapfunction.py
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -93,16 +93,3 @@ def test_unpivot_defaults():
     )
     q = df.unpivot(index="d")
     assert_gpu_result_equal(q)
-
-
-def test_unpivot_unsupported_cast_raises():
-    df = pl.LazyFrame(
-        {
-            "a": ["x", "y", "z"],
-            "b": pl.Series([1, 3, 5], dtype=pl.Int16),
-        }
-    )
-
-    q = df.unpivot(["a", "b"])
-
-    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index d3baf3bf4d2..dd67a019c77 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -65,6 +65,14 @@ include = [
 ]
 exclude = ["*tests*"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.ruff]
 extend = "../../pyproject.toml"
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index 48cea7266af..a7a116875ea 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -5,6 +5,8 @@
 from dask_expr.io.io import FusedParquetIO
 from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS
 
+from dask._task_spec import Task
+
 import cudf
 
 from dask_cudf import _deprecated_api
@@ -19,7 +21,7 @@ def _load_multiple_files(
         frag_filters,
         columns,
         schema,
-        *to_pandas_args,
+        **to_pandas_kwargs,
     ):
         import pyarrow as pa
 
@@ -46,7 +48,7 @@ def _load_multiple_files(
         )
         return CudfReadParquetPyarrowFS._table_to_pandas(
             get(dsk, name),
-            *to_pandas_args,
+            **to_pandas_kwargs,
         )
 
@@ -89,7 +91,7 @@ def _table_to_pandas(table, index_name):
             df = df.set_index(index_name)
         return df
 
-    def _filtered_task(self, index: int):
+    def _filtered_task(self, name, index: int):
         columns = self.columns.copy()
         index_name = self.index.name
         if self.index is not None:
@@ -99,16 +101,20 @@ def _filtered_task(self, index: int):
         if columns is None:
             columns = list(schema.names)
             columns.append(index_name)
-        return (
+        return Task(
+            name,
             self._table_to_pandas,
-            (
+            Task(
+                None,
                 self._fragment_to_table,
-                FragmentWrapper(self.fragments[index], filesystem=self.fs),
-                self.filters,
-                columns,
-                schema,
+                fragment_wrapper=FragmentWrapper(
+                    self.fragments[index], filesystem=self.fs
+                ),
+                filters=self.filters,
+                columns=columns,
+                schema=schema,
             ),
-            index_name,
+            index_name=index_name,
         )
 
     def _tune_up(self, parent):
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index 4351b672151..f11a5252080 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-import numpy as np
-import pandas as pd
 import pytest
 
 import dask
@@ -10,20 +8,7 @@
 
 import cudf
 
 import dask_cudf
-
-
-def _make_random_frame(nelem, npartitions=2):
-    rng = np.random.default_rng(seed=0)
-    df = pd.DataFrame(
-        {
-            "x": rng.integers(0, 5, size=nelem),
-            "y": rng.normal(loc=1.0, scale=1.0, size=nelem),
-        }
-    )
-    gdf = cudf.DataFrame.from_pandas(df)
-    dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
-    return df, dgf
-
+from dask_cudf.tests.utils import _make_random_frame
 
 _reducers = ["sum", "count", "mean", "var", "std", "min", "max"]
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index a9f61f75762..b44b3f939e7 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -19,7 +19,7 @@
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
-    rng = np.random.default_rng(seed=None)
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {"x": rng.random(size=nelem), "y": rng.random(size=nelem)}
     )
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index c7e4cbc45ea..07d9143db36 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
@@ -81,6 +81,14 @@ section-order = ["future", "standard-library", "third-party", "dask", "rapids",
 dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm", "cudf"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/libcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake
deleted file mode 100644
index 278d6751c15..00000000000
--- a/python/libcudf/cmake/Modules/WheelHelpers.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-include_guard(GLOBAL)
-
-# Making libraries available inside wheels by installing the associated targets.
-function(install_aliased_imported_targets)
-  list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets")
-
-  set(options "")
-  set(one_value "DESTINATION")
-  set(multi_value "TARGETS")
-  cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN})
-
-  message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'")
-
-  foreach(target IN LISTS __TARGETS)
-
-    if(NOT TARGET ${target})
-      message(VERBOSE "No target named ${target}")
-      continue()
-    endif()
-
-    get_target_property(alias_target ${target} ALIASED_TARGET)
-    if(alias_target)
-      set(target ${alias_target})
-    endif()
-
-    get_target_property(is_imported ${target} IMPORTED)
-    if(NOT is_imported)
-      # If the target isn't imported, install it into the wheel
-      install(TARGETS ${target} DESTINATION ${__DESTINATION})
-      message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})")
-    else()
-      # If the target is imported, make sure it's global
-      get_target_property(type ${target} TYPE)
-      if(${type} STREQUAL "UNKNOWN_LIBRARY")
-        install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})
-        message(VERBOSE "install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})")
-      else()
-        install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})
-        message(
-          VERBOSE
-          "install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})"
-        )
-      endif()
-    endif()
-  endforeach()
-endfunction()
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 62726bb0df4..8c650eb2144 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -48,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf"
 [project.entry-points."cmake.prefix"]
 libcudf = "libcudf"
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 600 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '525M'
+
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi
new file mode 100644
index 00000000000..a59e2a9dc93
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/aggregation.pyi
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.types import (
+    DataType,
+    Interpolation,
+    NanEquality,
+    NullEquality,
+    NullOrder,
+    NullPolicy,
+    Order,
+)
+
+class Kind(IntEnum):
+    SUM = ...
+    PRODUCT = ...
+    MIN = ...
+    MAX = ...
+    COUNT_VALID = ...
+    COUNT_ALL = ...
+    ANY = ...
+    ALL = ...
+    SUM_OF_SQUARES = ...
+    MEAN = ...
+    VARIANCE = ...
+    STD = ...
+    MEDIAN = ...
+    QUANTILE = ...
+    ARGMAX = ...
+    ARGMIN = ...
+    NUNIQUE = ...
+    NTH_ELEMENT = ...
+    RANK = ...
+    COLLECT_LIST = ...
+    COLLECT_SET = ...
+    PTX = ...
+    CUDA = ...
+    CORRELATION = ...
+    COVARIANCE = ...
+
+class CorrelationType(IntEnum):
+    PEARSON = ...
+    KENDALL = ...
+    SPEARMAN = ...
+
+class EWMHistory(IntEnum):
+    INFINITE = ...
+    FINITE = ...
+
+class RankMethod(IntEnum):
+    FIRST = ...
+    AVERAGE = ...
+    MIN = ...
+    MAX = ...
+    DENSE = ...
+
+class RankPercentage(IntEnum):
+    NONE = ...
+    ZERO_NORMALIZED = ...
+    ONE_NORMALIZED = ...
+
+class UdfType(IntEnum):
+    CUDA = ...
+    PTX = ...
+
+class Aggregation:
+    def __init__(self): ...
+    def kind(self) -> Kind: ...
+
+def sum() -> Aggregation: ...
+def product() -> Aggregation: ...
+def min() -> Aggregation: ...
+def max() -> Aggregation: ...
+def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ...
+def any() -> Aggregation: ...
+def all() -> Aggregation: ...
+def sum_of_squares() -> Aggregation: ...
+def mean() -> Aggregation: ...
+def variance(ddof: int = 1) -> Aggregation: ...
+def std(ddof: int = 1) -> Aggregation: ...
+def median() -> Aggregation: ...
+def quantile(
+    quantiles: list[float], interp: Interpolation = Interpolation.LINEAR
+) -> Aggregation: ...
+def argmax() -> Aggregation: ...
+def argmin() -> Aggregation: ...
+def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ...
+def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ...
+def nth_element(
+    n: int, null_handling: NullPolicy = NullPolicy.INCLUDE
+) -> Aggregation: ...
+def collect_list(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+) -> Aggregation: ...
+def collect_set(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Aggregation: ...
+def udf(operation: str, output_type: DataType) -> Aggregation: ...
+def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ...
+def covariance(min_periods: int, ddof: int) -> Aggregation: ...
+def rank(
+    method: RankMethod,
+    column_order: Order = Order.ASCENDING,
+    null_handling: NullPolicy = NullPolicy.EXCLUDE,
+    null_precedence: NullOrder = NullOrder.AFTER,
+    percentage: RankPercentage = RankPercentage.NONE,
+) -> Aggregation: ...
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx
index e510b738f70..662f76d5c8e 100644
--- a/python/pylibcudf/pylibcudf/aggregation.pyx
+++ b/python/pylibcudf/pylibcudf/aggregation.pyx
@@ -64,6 +64,40 @@
 from pylibcudf.libcudf.aggregation import udf_type as UdfType  # no-cython-lint
 
 from .types cimport DataType
 
+__all__ = [
+    "Aggregation",
+    "CorrelationType",
+    "EWMHistory",
+    "Kind",
+    "RankMethod",
+    "RankPercentage",
+    "UdfType",
+    "all",
+    "any",
+    "argmax",
+    "argmin",
+    "collect_list",
+    "collect_set",
+    "correlation",
+    "count",
+    "covariance",
+    "ewma",
+    "max",
+    "mean",
+    "median",
+    "min",
+    "nth_element",
+    "nunique",
+    "product",
+    "quantile",
+    "rank",
+    "std",
+    "sum",
+    "sum_of_squares",
+    "udf",
+    "variance",
+]
+
 cdef class Aggregation:
     """A type of aggregation to perform.
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi
new file mode 100644
index 00000000000..f745e6c6854
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/binaryop.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class BinaryOperator(IntEnum):
+    ADD = ...
+    SUB = ...
+    MUL = ...
+    DIV = ...
+    TRUE_DIV = ...
+    FLOOR_DIV = ...
+    MOD = ...
+    PMOD = ...
+    PYMOD = ...
+    POW = ...
+    INT_POW = ...
+    LOG_BASE = ...
+    ATAN2 = ...
+    SHIFT_LEFT = ...
+    SHIFT_RIGHT = ...
+    SHIFT_RIGHT_UNSIGNED = ...
+    BITWISE_AND = ...
+    BITWISE_OR = ...
+    BITWISE_XOR = ...
+    LOGICAL_AND = ...
+    LOGICAL_OR = ...
+    EQUAL = ...
+    NOT_EQUAL = ...
+    LESS = ...
+    GREATER = ...
+    LESS_EQUAL = ...
+    GREATER_EQUAL = ...
+    NULL_EQUALS = ...
+    NULL_MAX = ...
+    NULL_MIN = ...
+    NULL_NOT_EQUALS = ...
+    GENERIC_BINARY = ...
+    NULL_LOGICAL_AND = ...
+    NULL_LOGICAL_OR = ...
+    INVALID_BINARY = ...
+
+def binary_operation(
+    lhs: Column | Scalar,
+    rhs: Column | Scalar,
+    op: BinaryOperator,
+    output_type: DataType,
+) -> Column: ...
+def is_supported_operation(
+    out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator
+) -> bool: ...
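A small sketch of how the API described by the new `binaryop.pyi` stub is used (the column contents are hypothetical):

```python
import pyarrow as pa
import pylibcudf as plc

lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int32()))
rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))
out = plc.DataType(plc.TypeId.INT32)

# The stub advertises a support check alongside the operation itself.
if plc.binaryop.is_supported_operation(
    out, lhs.type(), rhs.type(), plc.binaryop.BinaryOperator.ADD
):
    result = plc.binaryop.binary_operation(
        lhs, rhs, plc.binaryop.BinaryOperator.ADD, out
    )
```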
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index eef73bf4e9d..b7b4ecc6e83 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -16,6 +16,7 @@
 from .column cimport Column
 from .scalar cimport Scalar
 from .types cimport DataType
 
+__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"]
 
 cpdef Column binary_operation(
     LeftBinaryOperand lhs,
diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi
new file mode 100644
index 00000000000..c9f70de3dbf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column.pyi
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Sequence
+from typing import Any
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class Column:
+    def __init__(
+        self,
+        data_type: DataType,
+        size: int,
+        data: gpumemoryview | None,
+        mask: gpumemoryview | None,
+        null_count: int,
+        offset: int,
+        children: list[Column],
+    ) -> None: ...
+    def type(self) -> DataType: ...
+    def child(self, index: int) -> Column: ...
+    def size(self) -> int: ...
+    def null_count(self) -> int: ...
+    def offset(self) -> int: ...
+    def data(self) -> gpumemoryview | None: ...
+    def null_mask(self) -> gpumemoryview | None: ...
+    def children(self) -> list[Column]: ...
+    def copy(self) -> Column: ...
+    def with_mask(
+        self, mask: gpumemoryview | None, null_count: int
+    ) -> Column: ...
+    def list_view(self) -> ListColumnView: ...
+    @staticmethod
+    def from_scalar(scalar: Scalar, size: int) -> Column: ...
+    @staticmethod
+    def all_null_like(like: Column, size: int) -> Column: ...
+    @staticmethod
+    def from_cuda_array_interface_obj(obj: Any) -> Column: ...
+
+class ListColumnView:
+    def __init__(self, column: Column): ...
+    def child(self) -> Column: ...
+    def offsets(self) -> Column: ...
+
+def is_c_contiguous(
+    shape: Sequence[int], strides: Sequence[int], itemsize: int
+) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx
index 4e5698566d0..9bb5574608e 100644
--- a/python/pylibcudf/pylibcudf/column.pyx
+++ b/python/pylibcudf/pylibcudf/column.pyx
@@ -17,6 +17,7 @@
 from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
 import functools
 
+__all__ = ["Column", "ListColumnView", "is_c_contiguous"]
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -61,6 +62,8 @@
         self._children = children
         self._num_children = len(children)
 
+    __hash__ = None
+
     cdef column_view view(self) nogil:
         """Generate a libcudf column_view to pass to libcudf algorithms.
@@ -384,6 +387,8 @@
             raise TypeError("Column is not a list type")
         self._column = col
 
+    __hash__ = None
+
     cpdef child(self):
         """The data column of the underlying list column."""
         return self._column.child(1)
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi
new file mode 100644
index 00000000000..c87fe423acb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column_factories.pyi
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column import Column
+from pylibcudf.types import DataType, MaskState, TypeId
+
+def make_empty_column(type_or_id: DataType | TypeId) -> Column: ...
+def make_numeric_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_point_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_timestamp_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_duration_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_width_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx
index ac942a620b5..c4969a7f502 100644
--- a/python/pylibcudf/pylibcudf/column_factories.pyx
+++ b/python/pylibcudf/pylibcudf/column_factories.pyx
@@ -17,6 +17,15 @@
 from .types cimport DataType, type_id
 
 from .types import MaskState, TypeId
 
+__all__ = [
+    "make_duration_column",
+    "make_empty_column",
+    "make_fixed_point_column",
+    "make_fixed_width_column",
+    "make_numeric_column",
+    "make_timestamp_column",
+]
+
 cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id):
     """Creates an empty column of the specified type.
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi
new file mode 100644
index 00000000000..79076f509e0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/concatenate.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def concatenate[ColumnOrTable: (Column, Table)](
+    objects: list[ColumnOrTable],
+) -> ColumnOrTable: ...
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx
index 10c860d97bb..42c5f34cf3e 100644
--- a/python/pylibcudf/pylibcudf/concatenate.pyx
+++ b/python/pylibcudf/pylibcudf/concatenate.pyx
@@ -12,6 +12,7 @@
 from pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["concatenate"]
 
 cpdef concatenate(list objects):
     """Concatenate columns or tables.
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi
new file mode 100644
index 00000000000..dd6328fbf23
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.table import Table
+
+class PackedColumns:
+    def __init__(self): ...
+    def release(self) -> tuple[memoryview, gpumemoryview]: ...
+
+def pack(input: Table) -> PackedColumns: ...
+def unpack(input: PackedColumns) -> Table: ...
+def unpack_from_memoryviews(
+    metadata: memoryview, gpu_data: gpumemoryview
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx
index ed926a3fcc0..94873e079c9 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pyx
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -20,6 +20,13 @@
 from .table cimport Table
 from .utils cimport int_to_void_ptr
 
+__all__ = [
+    "PackedColumns",
+    "pack",
+    "unpack",
+    "unpack_from_memoryviews",
+]
+
 cdef class HostBuffer:
     """Owning host buffer that implements the buffer protocol"""
     cdef unique_ptr[vector[uint8_t]] c_obj
@@ -38,6 +45,8 @@
         out.strides[0] = 1
         return out
 
+    __hash__ = None
+
     def __getbuffer__(self, Py_buffer *buffer, int flags):
         buffer.buf = dereference(self.c_obj).data()
         buffer.format = NULL  # byte
@@ -69,6 +78,8 @@
             "Use one of the factories."
         )
 
+    __hash__ = None
+
     @staticmethod
     cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data):
         """Create a Python PackedColumns from a libcudf packed_columns."""
diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi
new file mode 100644
index 00000000000..6cf4ed48724
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/copying.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+from typing import TypeVar
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+class MaskAllocationPolicy(IntEnum):
+    NEVER = ...
+    RETAIN = ...
+    ALWAYS = ...
+
+class OutOfBoundsPolicy(IntEnum):
+    NULLIFY = ...
+    DONT_CHECK = ...
+
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def gather(
+    source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy
+) -> Table: ...
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target_table: Table
+) -> Table: ...
+def empty_like(input: ColumnOrTable) -> ColumnOrTable: ...
+def allocate_like(
+    input_column: Column, policy: MaskAllocationPolicy, size: int | None = None
+) -> Column: ...
+def copy_range_in_place(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def copy_range(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ...
+def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ...
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+def copy_if_else(
+    lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column
+) -> Column: ...
+def boolean_mask_scatter(
+    input: Table | list[Scalar], target: Table, boolean_mask: Column
+) -> Table: ...
+def get_element(input_column: Column, index: int) -> Scalar: ...
diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx
index 4938f1a3dda..fb8b6f9890e 100644
--- a/python/pylibcudf/pylibcudf/copying.pyx
+++ b/python/pylibcudf/pylibcudf/copying.pyx
@@ -36,6 +36,23 @@
 from .table cimport Table
 from .utils cimport _as_vector
 
+__all__ = [
+    "MaskAllocationPolicy",
+    "OutOfBoundsPolicy",
+    "allocate_like",
+    "boolean_mask_scatter",
+    "copy_if_else",
+    "copy_range",
+    "copy_range_in_place",
+    "empty_like",
+    "gather",
+    "get_element",
+    "scatter",
+    "shift",
+    "slice",
+    "split",
+]
+
 cpdef Table gather(
     Table source_table,
     Column gather_map,
diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi
new file mode 100644
index 00000000000..6a3ae7953d9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/datetime.pyi
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class DatetimeComponent(IntEnum):
+    YEAR = ...
+    MONTH = ...
+    DAY = ...
+    WEEKDAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+class RoundingFrequency(IntEnum):
+    DAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+def extract_millisecond_fraction(input: Column) -> Column: ...
+def extract_microsecond_fraction(input: Column) -> Column: ...
+def extract_nanosecond_fraction(input: Column) -> Column: ...
+def extract_datetime_component( + input: Column, component: DatetimeComponent +) -> Column: ... +def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def add_calendrical_months( + input: Column, months: Column | Scalar +) -> Column: ... +def day_of_year(input: Column) -> Column: ... +def is_leap_year(input: Column) -> Column: ... +def last_day_of_month(input: Column) -> Column: ... +def extract_quarter(input: Column) -> Column: ... +def days_in_month(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 9e5e709d81d..b100e3e22d0 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -29,6 +29,24 @@ from cython.operator cimport dereference from .column cimport Column +__all__ = [ + "DatetimeComponent", + "RoundingFrequency", + "add_calendrical_months", + "ceil_datetimes", + "day_of_year", + "days_in_month", + "extract_datetime_component", + "extract_microsecond_fraction", + "extract_millisecond_fraction", + "extract_nanosecond_fraction", + "extract_quarter", + "floor_datetimes", + "is_leap_year", + "last_day_of_month", + "round_datetimes", +] + cpdef Column extract_millisecond_fraction( Column input ): diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi new file mode 100644 index 00000000000..bbfb86b0ff6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +def enable_prefetching(key: str) -> None: ... +def disable_prefetching(key: str) -> None: ... +def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx index b25a53e13b2..d94d6d087ac 100644 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -5,6 +5,8 @@ from libcpp.string cimport string from pylibcudf.libcudf cimport experimental as cpp_experimental +__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] + cpdef enable_prefetching(str key): """Turn on prefetch instructions for the given key. diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi new file mode 100644 index 00000000000..12b473d8605 --- /dev/null +++ b/python/pylibcudf/pylibcudf/expressions.pyi @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +from pylibcudf.scalar import Scalar + +class TableReference(IntEnum): + LEFT = ... + RIGHT = ... + +class ASTOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PYMOD = ... + POW = ... + EQUAL = ... + NULL_EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + NULL_LOGICAL_AND = ... + LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + LOGICAL_OR = ... + IDENTITY = ... + IS_NULL = ... + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... 
+
+class Expression:
+    def __init__(self): ...
+
+class Literal(Expression):
+    def __init__(self, value: Scalar): ...
+
+class ColumnReference(Expression):
+    def __init__(
+        self, index: int, table_source: TableReference = TableReference.LEFT
+    ): ...
+
+class ColumnNameReference(Expression):
+    def __init__(self, name: str): ...
+
+class Operation(Expression):
+    def __init__(
+        self,
+        op: ASTOperator,
+        left: Expression,
+        right: Expression | None = None,
+    ): ...
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index 1535f68366b..0f12cfe313c 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -49,6 +49,16 @@ from .types cimport DataType
 # Aliases for simplicity
 ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
 
+__all__ = [
+    "ASTOperator",
+    "ColumnNameReference",
+    "ColumnReference",
+    "Expression",
+    "Literal",
+    "Operation",
+    "TableReference",
+]
+
 # Define this class just to have a docstring for it
 cdef class Expression:
     """
@@ -58,7 +68,7 @@ cdef class Expression:
 
     For details, see :cpp:class:`cudf::ast::expression`.
     """
-    pass
+    __hash__ = None
 
 cdef class Literal(Expression):
     """
diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd
index b9345f8cd42..56aef086e1b 100644
--- a/python/pylibcudf/pylibcudf/filling.pxd
+++ b/python/pylibcudf/pylibcudf/filling.pxd
@@ -33,3 +33,9 @@ cpdef Table repeat(
     Table input_table,
     ColumnOrSize count
 )
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+)
diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi
new file mode 100644
index 00000000000..0b5e29bdc32
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/filling.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+def fill(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> Column: ...
+def fill_in_place(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> None: ...
+def sequence(size: int, init: Scalar, step: Scalar) -> Column: ...
+def repeat(input_table: Table, count: Column | int) -> Table: ...
+def calendrical_month_sequence(
+    n: int, init: Scalar, months: int
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index a47004a1e42..ea5b45ff7c2 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport (
     fill_in_place as cpp_fill_in_place,
     repeat as cpp_repeat,
     sequence as cpp_sequence,
+    calendrical_month_sequence as cpp_calendrical_month_sequence
 )
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
@@ -18,6 +19,14 @@ from .scalar cimport Scalar
 from .table cimport Table
 
 
+__all__ = [
+    "fill",
+    "fill_in_place",
+    "repeat",
+    "sequence",
+    "calendrical_month_sequence",
+]
+
 cpdef Column fill(
     Column destination,
     size_type begin,
@@ -164,3 +173,39 @@ cpdef Table repeat(
         count
     )
     return Table.from_libcudf(move(result))
+
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+):
+
+    """Generate a column of n timestamps beginning at init and incrementing by months.
+
+    For details, see :cpp:func:`calendrical_month_sequence`.
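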
+ + Parameters + ---------- + n : size_type + Number of timestamps to generate + init : Scalar + The initial timestamp + months : size_type + Months to increment + + Returns + ------- + pylibcudf.Column + Timestamps column with sequences of months + """ + + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_calendrical_month_sequence( + n, + dereference(init.c_obj), + months + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi new file mode 100644 index 00000000000..50f1f39a515 --- /dev/null +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping +from typing import Any + +class gpumemoryview: + def __init__(self, data: Any): ... + @property + def __cuda_array_interface__(self) -> Mapping[str, Any]: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 0904022a944..41316eddb60 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +__all__ = ["gpumemoryview"] cdef class gpumemoryview: """Minimal representation of a memory buffer. @@ -25,3 +26,5 @@ cdef class gpumemoryview: @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi new file mode 100644 index 00000000000..883ad6e34cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/groupby.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.replace import ReplacePolicy +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted + +class GroupByRequest: + def __init__( + self, values: Column, aggregations: list[Aggregation] + ) -> None: ... + +class GroupBy: + def __init__( + self, + keys: Table, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + keys_are_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, + ) -> None: ... + def aggregate( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def scan( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def shift( + self, values: Table, offset: list[int], fill_values: list[Scalar] + ) -> tuple[Table, Table]: ... + def replace_nulls( + self, value: Table, replace_policies: list[ReplacePolicy] + ) -> tuple[Table, Table]: ... + def get_groups( + self, values: Table | None = None + ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index 71f9ecb0453..e6cb3ac81a7 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector +__all__ = ["GroupBy", "GroupByRequest"] + cdef class GroupByRequest: """A request for a groupby aggregation or scan. 
@@ -45,6 +47,8 @@ cdef class GroupByRequest: self._values = values self._aggregations = aggregations + __hash__ = None + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. @@ -127,6 +131,8 @@ cdef class GroupBy: # deallocated from under us: self._keys = keys + __hash__ = None + @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi new file mode 100644 index 00000000000..a849f5d0729 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Final + +from pylibcudf.column import Column +from pylibcudf.table import Table + +LIBCUDF_DEFAULT_HASH_SEED: Final[int] + +def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... +def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_64(input: Table, seed: int = ...) -> Column: ... +def md5(input: Table) -> Column: ... +def sha1(input: Table) -> Column: ... +def sha224(input: Table) -> Column: ... +def sha256(input: Table) -> Column: ... +def sha384(input: Table) -> Column: ... +def sha512(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 9ea3d4d1bda..548cffc0ce8 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -20,6 +20,19 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "LIBCUDF_DEFAULT_HASH_SEED", + "md5", + "murmurhash3_x64_128", + "murmurhash3_x86_32", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash_64", +] + LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi new file mode 100644 index 00000000000..63de816010b --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, overload + +import pyarrow as pa + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import DataType + +@dataclass +class ColumnMetadata: + name: str = ... + children_meta: list[ColumnMetadata] = ... + +@overload +def from_arrow(obj: pa.DataType) -> DataType: ... +@overload +def from_arrow( + obj: pa.Scalar[Any], *, data_type: DataType | None = None +) -> Scalar: ... +@overload +def from_arrow(obj: pa.Array[Any]) -> Column: ... +@overload +def from_arrow(obj: pa.Table) -> Table: ... +@overload +def to_arrow( + obj: DataType, + *, + precision: int | None = None, + fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]] + | Mapping[str, pa.DataType] + | None = None, + value_type: pa.DataType | None = None, +) -> pa.DataType: ... +@overload +def to_arrow( + obj: Table, metadata: list[ColumnMetadata | str] | None = None +) -> pa.Table: ... +@overload +def to_arrow( + obj: Column, metadata: ColumnMetadata | str | None = None +) -> pa.Array[Any]: ... +@overload +def to_arrow( + obj: Scalar, metadata: ColumnMetadata | str | None = None +) -> pa.Scalar[Any]: ... +def from_dlpack(managed_tensor: Any) -> Table: ... +def to_dlpack(input: Table) -> Any: ... 
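The overloads above encode how from_arrow/to_arrow dispatch on the input type. A minimal round-trip sketch (assumes pyarrow is available; not part of the diff):

    import pyarrow as pa
    import pylibcudf as plc

    dtype = plc.interop.from_arrow(pa.int32())                # -> DataType
    scalar = plc.interop.from_arrow(pa.scalar(42))            # -> Scalar
    column = plc.interop.from_arrow(pa.array([1, 2, 3]))      # -> Column
    table = plc.interop.from_arrow(pa.table({"a": [1, 2]}))   # -> Table

    # ColumnMetadata supplies names when converting back to Arrow.
    arrow_table = plc.interop.to_arrow(
        table, metadata=[plc.interop.ColumnMetadata(name="a")]
    )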
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 61e812353b7..bd5397ac328 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -38,6 +38,14 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +__all__ = [ + "ColumnMetadata", + "from_arrow", + "from_dlpack", + "to_arrow", + "to_dlpack", +] + ARROW_TO_PYLIBCUDF_TYPES = { pa.int8(): type_id.INT8, pa.int16(): type_id.INT16, diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 965724a47b1..664faef718f 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - types.pyx +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx + parquet_metadata.pyx text.pyx timezone.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 1bcc0a3f963..663804e714d 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, types +from . cimport ( + avro, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 2e4f215b12c..f913a400684 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,31 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, types +from . import ( + avro, + csv, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types import SinkInfo, SourceInfo, TableWithMetadata + +__all__ = [ + "SinkInfo", + "SourceInfo", + "TableWithMetadata", + "avro", + "csv", + "datasource", + "json", + "orc", + "parquet", + "parquet_metadata", + "text", + "timezone", + "types", +] diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi new file mode 100644 index 00000000000..49c2f083702 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +__all__ = ["read_avro"] + +def read_avro( + source_info: SourceInfo, + columns: list[str] | None = None, + skip_rows: int = 0, + num_rows: int = -1, +) -> TableWithMetadata: ... 
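A hedged read_avro sketch against the stub above ("events.avro" and its column names are hypothetical):

    import pylibcudf as plc

    result = plc.io.avro.read_avro(
        plc.io.SourceInfo(["events.avro"]),
        columns=["id", "payload"],
        skip_rows=0,
        num_rows=-1,  # -1 reads all remaining rows
    )
    names = result.column_names(include_children=False)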
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index fe765b34f82..4271333511a 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["read_avro"] + cpdef TableWithMetadata read_avro( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi new file mode 100644 index 00000000000..356825a927d --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping + +from pylibcudf.io.types import ( + CompressionType, + QuoteStyle, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +def read_csv( + source_info: SourceInfo, + *, + compression: CompressionType = CompressionType.AUTO, + byte_range_offset: int = 0, + byte_range_size: int = 0, + col_names: list[str] | None = None, + prefix: str = "", + mangle_dupe_cols: bool = True, + usecols: list[int] | list[str] | None = None, + nrows: int = -1, + skiprows: int = 0, + skipfooter: int = 0, + header: int = 0, + lineterminator: str = "\n", + delimiter: str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + delim_whitespace: bool = False, + skipinitialspace: bool = False, + skip_blank_lines: bool = True, + quoting: QuoteStyle = QuoteStyle.MINIMAL, + quotechar: str = '"', + doublequote: bool = True, + parse_dates: list[str] | list[int] | None = None, + parse_hex: list[str] | list[int] | None = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + dtypes: Mapping[str, DataType] | list[DataType] | None = None, + true_values: list[str] | None = None, + false_values: list[str] | None = None, + na_values: list[str] | None = None, + keep_default_na: bool = True, + na_filter: bool = True, + dayfirst: bool = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # detect_whitespace_around_quotes: bool = False, + # timestamp_type: DataType = DataType(type_id.EMPTY), +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 2c61cc42d82..858e580ab34 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -19,6 +19,8 @@ from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["read_csv"] + cdef tuple _process_parse_dates_hex(list cols): cdef vector[string] str_cols cdef vector[int] int_cols diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi new file mode 100644 index 00000000000..e52197f793b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/datasource.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +class Datasource: + def __init__(self): ... 
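Most read_csv arguments above are keyword-only (the reader is def rather than cpdef for exactly this reason). A small sketch under that assumption; "measurements.csv" and its column names are hypothetical:

    import pylibcudf as plc
    from pylibcudf.types import DataType, TypeId

    tbl_w_meta = plc.io.csv.read_csv(
        plc.io.SourceInfo(["measurements.csv"]),
        delimiter=",",
        header=0,
        dtypes={"temp": DataType(TypeId.FLOAT64)},
        na_values=["NA", "null"],
    )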
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 02418444caa..aac1c0d1014 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -2,8 +2,10 @@ from pylibcudf.libcudf.io.datasource cimport datasource +__all__ = ["Datasource"] cdef class Datasource: + __hash__ = None cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi new file mode 100644 index 00000000000..b2bc6a43700 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from collections.abc import Mapping +from typing import TypeAlias + +from pylibcudf.column import Column +from pylibcudf.io.types import ( + CompressionType, + JSONRecoveryMode, + SinkInfo, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] + +NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] + +def read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + lines: bool = False, + byte_range_offset: int = 0, + byte_range_size: int = 0, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, +) -> TableWithMetadata: ... +def write_json( + sink_info: SinkInfo, + table_w_meta: TableWithMetadata, + na_rep: str = "", + include_nulls: bool = False, + lines: bool = False, + rows_per_chunk: int = 2**32 - 1, + true_value: str = "true", + false_value: str = "false", +) -> None: ... +def chunked_read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + chunk_size: int = 100_000_000, +) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 65f78f830f1..ad2989925c9 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["chunked_read_json", "read_json", "write_json"] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi new file mode 100644 index 00000000000..4cf87f1a832 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Any + +from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.types import DataType + +def read_orc( + source_info: SourceInfo, + columns: list[str] | None = None, + stripes: list[list[int]] | None = None, + skip_rows: int = 0, + nrows: int = -1, + use_index: bool = True, + use_np_dtypes: bool = True, + timestamp_type: DataType | None = None, + decimal128_columns: list[str] | None = None, +) -> TableWithMetadata: ... 
+ +class OrcColumnStatistics: + def __init__(self): ... + @property + def number_of_values(self) -> int | None: ... + @property + def has_null(self) -> bool | None: ... + def __getitem__(self, item: str) -> Any: ... + def __contains__(self, item: str) -> bool: ... + def get[T](self, item: str, default: None | T = None) -> T | None: ... + +class ParsedOrcStatistics: + def __init__(self): ... + @property + def column_names(self) -> list[str]: ... + @property + def file_stats(self) -> list[OrcColumnStatistics]: ... + @property + def stripes_stats(self) -> list[OrcColumnStatistics]: ... + +def read_parsed_orc_statistics( + source_info: SourceInfo, +) -> ParsedOrcStatistics: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 70e0a7995a2..4270f5b4f95 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +__all__ = [ + "OrcColumnStatistics", + "ParsedOrcStatistics", + "read_orc", + "read_parsed_orc_statistics", +] cdef class OrcColumnStatistics: def __init__(self): @@ -39,6 +45,8 @@ cdef class OrcColumnStatistics: "use `OrcColumnStatistics.from_libcudf` instead." ) + __hash__ = None + @property def number_of_values(self): if self.number_of_values_c.has_value(): @@ -183,6 +191,8 @@ cdef class OrcColumnStatistics: cdef class ParsedOrcStatistics: + __hash__ = None + @property def column_names(self): return [name.decode() for name in self.c_obj.column_names] diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi new file mode 100644 index 00000000000..bcf1d1cce09 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.expressions import Expression +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +class ChunkedParquetReader: + def __init__( + self, + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + use_pandas_metadata: bool = True, + convert_strings_to_categories: bool = False, + skip_rows: int = 0, + nrows: int = 0, + chunk_read_limit: int = 0, + pass_read_limit: int = 1024000000, + allow_mismatched_pq_schemas: bool = False, + ) -> None: ... + def has_next(self) -> bool: ... + def read_chunk(self) -> TableWithMetadata: ... + +def read_parquet( + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + filters: Expression | None = None, + convert_strings_to_categories: bool = False, + use_pandas_metadata: bool = True, + skip_rows: int = 0, + nrows: int = -1, + allow_mismatched_pq_schemas: bool = False, + # disabled see comment in parquet.pyx for more + # reader_column_schema: ReaderColumnSchema = *, + # timestamp_type: DataType = * +) -> TableWithMetadata: ... 
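The chunked reader above is meant to bound memory while scanning large files. A usage sketch ("big.parquet" and process() are hypothetical; the limits are in bytes, and 0 means unlimited):

    import pylibcudf as plc

    reader = plc.io.parquet.ChunkedParquetReader(
        plc.io.SourceInfo(["big.parquet"]),
        chunk_read_limit=512_000_000,
    )
    while reader.has_next():
        chunk = reader.read_chunk()  # TableWithMetadata
        process(chunk.tbl)           # caller-defined work on the chunk's Table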
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 981ca7b8159..b76a352d633 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport ( from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +__all__ = ["ChunkedParquetReader", "read_parquet"] + cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, @@ -123,6 +125,8 @@ cdef class ChunkedParquetReader: ) ) + __hash__ = None + cpdef bool has_next(self): """ Returns True if there is another chunk in the Parquet file diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd new file mode 100644 index 00000000000..e421a64adc8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io.parquet_metadata cimport( + parquet_metadata, + parquet_schema, + parquet_column_schema, +) + +cdef class ParquetColumnSchema: + cdef parquet_column_schema column_schema + + @staticmethod + cdef from_column_schema(parquet_column_schema column_schema) + + cpdef str name(self) + + cpdef int num_children(self) + + cpdef ParquetColumnSchema child(self, int idx) + + cpdef list children(self) + + +cdef class ParquetSchema: + cdef parquet_schema schema + + @staticmethod + cdef from_schema(parquet_schema schema) + + cpdef ParquetColumnSchema root(self) + + +cdef class ParquetMetadata: + cdef parquet_metadata meta + + @staticmethod + cdef from_metadata(parquet_metadata meta) + + cpdef ParquetSchema schema(self) + + cpdef int num_rows(self) + + cpdef int num_rowgroups(self) + + cpdef dict metadata(self) + + cpdef list rowgroup_metadata(self) + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info) diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx new file mode 100644 index 00000000000..0ad4dafb0cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -0,0 +1,214 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata + + +__all__ = [ + "ParquetColumnSchema", + "ParquetMetadata", + "ParquetSchema", + "read_parquet_metadata", +] + +cdef class ParquetColumnSchema: + """ + Schema of a parquet column, including the nested columns. + + Parameters + ---------- + parquet_column_schema + """ + def __init__(self): + raise ValueError("Construct ParquetColumnSchema with from_column_schema.") + + @staticmethod + cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema): + cdef ParquetColumnSchema result = ParquetColumnSchema.__new__( + ParquetColumnSchema + ) + result.column_schema = column_schema + return result + + cpdef str name(self): + """ + Returns parquet column name; can be empty. + + Returns + ------- + str + Column name + """ + return self.column_schema.name().decode() + + cpdef int num_children(self): + """ + Returns the number of child columns. + + Returns + ------- + int + Children count + """ + return self.column_schema.num_children() + + cpdef ParquetColumnSchema child(self, int idx): + """ + Returns schema of the child with the given index. 
+
+        Parameters
+        ----------
+        idx : int
+            Child index
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Child schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx))
+
+    cpdef list children(self):
+        """
+        Returns schemas of all child columns.
+
+        Returns
+        -------
+        list[ParquetColumnSchema]
+            Child schemas.
+        """
+        cdef cpp_parquet_metadata.parquet_column_schema child
+        return [
+            ParquetColumnSchema.from_column_schema(child)
+            for child in self.column_schema.children()
+        ]
+
+
+cdef class ParquetSchema:
+    """
+    Schema of a parquet file.
+
+    Parameters
+    ----------
+    parquet_schema
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetSchema with from_schema.")
+
+    @staticmethod
+    cdef from_schema(cpp_parquet_metadata.parquet_schema schema):
+        cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema)
+        result.schema = schema
+        return result
+
+    cpdef ParquetColumnSchema root(self):
+        """
+        Returns the schema of the struct column that contains all columns as fields.
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Root column schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.schema.root())
+
+
+cdef class ParquetMetadata:
+    """
+    Information about the content of a parquet file.
+
+    Parameters
+    ----------
+    parquet_metadata
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetMetadata with from_metadata.")
+
+    @staticmethod
+    cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta):
+        cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata)
+        result.meta = meta
+        return result
+
+    cpdef ParquetSchema schema(self):
+        """
+        Returns the parquet schema.
+
+        Returns
+        -------
+        ParquetSchema
+            Parquet schema
+        """
+        return ParquetSchema.from_schema(self.meta.schema())
+
+    cpdef int num_rows(self):
+        """
+        Returns the number of rows of the root column.
+
+        Returns
+        -------
+        int
+            Number of rows
+        """
+        return self.meta.num_rows()
+
+    cpdef int num_rowgroups(self):
+        """
+        Returns the number of rowgroups in the file.
+
+        Returns
+        -------
+        int
+            Number of row groups.
+        """
+        return self.meta.num_rowgroups()
+
+    cpdef dict metadata(self):
+        """
+        Returns the key-value metadata in the file footer.
+
+        Returns
+        -------
+        dict[str, str]
+            Key-value metadata as a map.
+        """
+        return {key.decode(): val.decode() for key, val in self.meta.metadata()}
+
+    cpdef list rowgroup_metadata(self):
+        """
+        Returns the row group metadata in the file footer.
+
+        Returns
+        -------
+        list[dict[str, int]]
+            List of row group metadata maps.
+        """
+        return [
+            {key.decode(): val for key, val in metadata}
+            for metadata in self.meta.rowgroup_metadata()
+        ]
+
+
+cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info):
+    """
+    Reads metadata of a parquet dataset.
+
+    Parameters
+    ----------
+    src_info : SourceInfo
+        Dataset source.
+
+    Returns
+    -------
+    ParquetMetadata
+        ParquetMetadata with the parquet schema, number of rows,
+        number of row groups, and key-value metadata.
+    """
+    cdef cpp_parquet_metadata.parquet_metadata c_result
+
+    with nogil:
+        c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj)
+
+    return ParquetMetadata.from_metadata(c_result)
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051e9bc0cde
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
+
+cdef class ParseOptions:
+    cdef parse_options c_options
+
+cdef class DataChunkSource:
+    cdef unique_ptr[data_chunk_source] c_source
+    cdef string data_ref
+
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=*,
+    int virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..d3cbdc4cd60
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,202 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+__all__ = [
+    "DataChunkSource",
+    "ParseOptions",
+    "make_source",
+    "make_source_from_bgzip_file",
+    "make_source_from_file",
+    "multibyte_split",
+]
+
+cdef class ParseOptions:
+    """
+    Parsing options for `multibyte_split`
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        Only rows starting inside this byte range will be
+        part of the output column.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """
+    Data source for `multibyte_split`
+
+    Parameters
+    ----------
+    data : str
+        Filename or data itself.
+    """
+
+    def __cinit__(self, str data):
+        # Need to keep a reference alive for make_source
+        self.data_ref = data.encode()
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host data to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(data)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=-1,
+    int virtual_end=-1,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default -1
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset.
+
+    virtual_end : int, default -1
+        The virtual (Tabix) offset one past the last byte to be read.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef uint64_t c_virtual_begin
+    cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+
+    if virtual_begin == -1 and virtual_end == -1:
+        with nogil:
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
+    elif virtual_begin != -1 and virtual_end != -1:
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
+        with nogil:
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
+            )
+    else:
+        raise ValueError(
+            "virtual_begin and virtual_end must both be left at -1 or both be set"
+        )
+    return dcs
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multibyte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The source string.
+
+    delimiter : str
+        UTF-8 encoded string for which to find offsets in the source.
+
+    options : ParseOptions
+        The parsing options to use (including byte range).
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_source)
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            dereference(c_source),
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi
new file mode 100644
index 00000000000..0582800c4af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table import Table
+
+def make_timezone_transition_table(
+    tzif_dir: str, timezone_name: str
+) -> Table: ...
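Tying the text APIs above together, a minimal multibyte_split sketch (illustrative only, not part of the diff):

    import pylibcudf as plc

    source = plc.io.text.make_source("x::y::z")
    col = plc.io.text.multibyte_split(
        source,
        "::",
        plc.io.text.ParseOptions(strip_delimiters=True),
    )
    # -> strings column ["x", "y", "z"]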
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index f120b65fb2c..af7cf8a4ee5 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from ..table cimport Table +__all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): """ diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi new file mode 100644 index 00000000000..a4f4fc13bdc --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import io +import os +from collections.abc import Mapping +from enum import IntEnum +from typing import Any, Literal, TypeAlias, overload + +from pylibcudf.column import Column +from pylibcudf.io.datasource import Datasource +from pylibcudf.table import Table + +class JSONRecoveryMode(IntEnum): + FAIL = ... + RECOVER_WITH_NULL = ... + +class CompressionType(IntEnum): + NONE = ... + AUTO = ... + SNAPPY = ... + GZIP = ... + BZIP2 = ... + BROTLI = ... + ZIP = ... + XZ = ... + ZLIB = ... + LZ4 = ... + LZO = ... + ZSTD = ... + +class ColumnEncoding(IntEnum): + USE_DEFAULT = ... + DICTIONARY = ... + PLAIN = ... + DELTA_BINARY_PACKED = ... + DELTA_LENGTH_BYTE_ARRAY = ... + DELTA_BYTE_ARRAY = ... + BYTE_STREAM_SPLIT = ... + DIRECT = ... + DIRECT_V2 = ... + DICTIONARY_V2 = ... + +class DictionaryPolicy(IntEnum): + NEVER = ... + ADAPTIVE = ... + ALWAYS = ... + +class StatisticsFreq(IntEnum): + STATISTICS_NONE = ... + STATISTICS_ROWGROUP = ... + STATISTICS_PAGE = ... + STATISTICS_COLUMN = ... + +class QuoteStyle(IntEnum): + MINIMAL = ... + ALL = ... + NONNUMERIC = ... + NONE = ... + +ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] +ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] + +class TableWithMetadata: + tbl: Table + def __init__( + self, tbl: Table, column_names: list[ColumnNameSpec] + ) -> None: ... + @property + def columns(self) -> list[Column]: ... + @overload + def column_names(self, include_children: Literal[False]) -> list[str]: ... + @overload + def column_names( + self, include_children: Literal[True] + ) -> list[ColumnNameSpec]: ... + @overload + def column_names( + self, include_children: bool = False + ) -> list[str] | list[ColumnNameSpec]: ... + @property + def child_names(self) -> ChildNameSpec: ... + @property + def per_file_user_data(self) -> list[Mapping[str, str]]: ... + +class SourceInfo: + def __init__( + self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource] + ) -> None: ... + +class SinkInfo: + def __init__( + self, + sinks: list[os.PathLike[Any]] + | list[io.StringIO] + | list[io.BytesIO] + | list[io.TextIOBase] + | list[str], + ) -> None: ... 
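A short sketch of the source/sink containers stubbed above (paths are hypothetical). Sources must be homogeneous — mixing, say, paths with Datasource objects raises ValueError — while sinks may be in-memory buffers:

    import io
    import pylibcudf as plc

    src = plc.io.SourceInfo(["part-0.json", "part-1.json"])  # all paths
    sink = plc.io.SinkInfo([io.BytesIO()])                   # in-memory sink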
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 967d05e7057..5db4eeb9583 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -20,6 +20,7 @@ import codecs
 import errno
 import io
 import os
+import re
 
 from pylibcudf.libcudf.io.json import \
     json_recovery_mode_t as JSONRecoveryMode  # no-cython-lint
@@ -27,9 +28,21 @@ from pylibcudf.libcudf.io.types import (
     compression_type as CompressionType,  # no-cython-lint
     column_encoding as ColumnEncoding,  # no-cython-lint
     dictionary_policy as DictionaryPolicy,  # no-cython-lint
+    quote_style as QuoteStyle,  # no-cython-lint
     statistics_freq as StatisticsFreq,  # no-cython-lint
 )
 
+__all__ = [
+    "ColumnEncoding",
+    "CompressionType",
+    "DictionaryPolicy",
+    "JSONRecoveryMode",
+    "QuoteStyle",
+    "SinkInfo",
+    "SourceInfo",
+    "StatisticsFreq",
+    "TableWithMetadata",
+]
 
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
@@ -53,6 +66,8 @@ cdef class TableWithMetadata:
 
         self.metadata.schema_info = self._make_column_info(column_names)
 
+    __hash__ = None
+
     cdef vector[column_name_info] _make_column_info(self, list column_names):
         cdef vector[column_name_info] col_name_infos
         cdef column_name_info info
@@ -147,6 +162,8 @@ cdef class SourceInfo:
 
     Mixing different types of sources will raise a `ValueError`.
     """
+    # Regular expression that matches remote file paths supported by libcudf
+    _is_remote_file_pattern = re.compile(r"^s3://", re.IGNORECASE)
 
     def __init__(self, list sources):
         if not sources:
@@ -161,11 +178,10 @@ cdef class SourceInfo:
         for src in sources:
             if not isinstance(src, (os.PathLike, str)):
                 raise ValueError("All sources must be of the same type!")
-            if not os.path.isfile(src):
-                raise FileNotFoundError(errno.ENOENT,
-                                        os.strerror(errno.ENOENT),
-                                        src)
-
+            if not (os.path.isfile(src) or self._is_remote_file_pattern.match(src)):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), src
+                )
             c_files.push_back(str(src).encode())
 
         self.c_obj = move(source_info(c_files))
@@ -217,6 +233,8 @@ cdef class SourceInfo:
 
         self.c_obj = source_info(c_host_buffers)
 
+    __hash__ = None
+
 
 # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
 # write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
@@ -299,3 +317,5 @@ cdef class SinkInfo:
         else:
             # we don't have sinks so we must have paths to sinks
             self.c_obj = sink_info(paths)
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi
new file mode 100644
index 00000000000..f34357baa67
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/join.pyi
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.expressions import Expression
+from pylibcudf.table import Table
+from pylibcudf.types import NullEquality
+
+def inner_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def full_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_semi_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def left_anti_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def cross_join(left: Table, right: Table) -> Table: ...
+def conditional_inner_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_full_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_semi_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def conditional_left_anti_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def mixed_inner_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_full_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_semi_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... +def mixed_left_anti_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 0d841eee194..c2efe05ffc4 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -15,6 +15,24 @@ from .column cimport Column from .expressions cimport Expression from .table cimport Table +__all__ = [ + "conditional_full_join", + "conditional_inner_join", + "conditional_left_anti_join", + "conditional_left_join", + "conditional_left_semi_join", + "cross_join", + "full_join", + "inner_join", + "left_anti_join", + "left_join", + "left_semi_join", + "mixed_full_join", + "mixed_inner_join", + "mixed_left_anti_join", + "mixed_left_join", + "mixed_left_semi_join", +] cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi new file mode 100644 index 00000000000..b93d4876dab --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class GetJsonObjectOptions: + def __init__( + self, + *, + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> None: ... + def get_allow_single_quotes(self) -> bool: ... + def get_strip_quotes_from_single_strings(self) -> bool: ... + def get_missing_fields_as_nulls(self) -> bool: ... + def set_allow_single_quotes(self, val: bool) -> None: ... + def set_strip_quotes_from_single_strings(self, val: bool) -> None: ... + def set_missing_fields_as_nulls(self, val: bool) -> None: ... + +def get_json_object( + col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None +) -> Column: ... 
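A get_json_object sketch against the stub above (assumes pyarrow for building the input column and the JSONPath scalar; illustrative only):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(['{"a": 1}', '{"a": 2}']))
    path = plc.interop.from_arrow(pa.scalar("$.a"))  # JSONPath as a string scalar
    opts = plc.json.GetJsonObjectOptions(missing_fields_as_nulls=True)
    vals = plc.json.get_json_object(col, path, opts)  # -> strings ["1", "2"]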
diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index ebb82f80408..5ec1e1be971 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.scalar cimport Scalar +__all__ = ["GetJsonObjectOptions", "get_json_object"] cdef class GetJsonObjectOptions: """Settings for ``get_json_object()``""" @@ -26,6 +27,8 @@ cdef class GetJsonObjectOptions: ) self.set_missing_fields_as_nulls(missing_fields_as_nulls) + __hash__ = None + def get_allow_single_quotes(self): """ Returns true/false depending on whether single-quotes for representing strings diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index 6f8797ae7d3..b1f9f2e806d 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -8,7 +8,7 @@ from .column cimport Column cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi new file mode 100644 index 00000000000..c3a75d10baf --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class Inclusive(IntEnum): + YES = ... + NO = ... + +def label_bins( + input: Column, + left_edges: Column, + left_inclusive: Inclusive, + right_edges: Column, + right_inclusive: Inclusive, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 226a9e14172..cae1830f6b9 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint from .column cimport Column +__all__ = ["Inclusive", "label_bins"] cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ): """Labels elements based on membership in the specified bins. @@ -28,11 +29,11 @@ cpdef Column label_bins( Column of input elements to label according to the specified bins. left_edges : Column Column of the left edge of each bin. - left_inclusive : bool + left_inclusive : Inclusive Whether or not the left edge is inclusive. right_edges : Column Column of the right edge of each bin. - right_inclusive : bool + right_inclusive : Inclusive Whether or not the right edge is inclusive. Returns @@ -42,24 +43,13 @@ cpdef Column label_bins( according to the specified bins. 
""" cdef unique_ptr[column] c_result - cdef inclusive c_left_inclusive = ( - inclusive.YES - if left_inclusive - else inclusive.NO - ) - cdef inclusive c_right_inclusive = ( - inclusive.YES - if right_inclusive - else inclusive.NO - ) - with nogil: c_result = cpp_labeling.label_bins( input.view(), left_edges.view(), - c_left_inclusive, + left_inclusive, right_edges.view(), - c_right_inclusive, + right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 15beaee47d4..00669ff579a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -24,4 +24,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) add_subdirectory(io) +add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 8e6da56c9a6..b0ce13e4492 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport source_info cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: @@ -28,4 +28,4 @@ cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: unordered_map[string, string] metadata() except+ vector[unordered_map[string, int64_t]] rowgroup_metadata() except+ - cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+ + cdef parquet_metadata read_parquet_metadata(source_info src_info) except+ diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt new file mode 100644 index 00000000000..c896db2c85a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources combine.pyx contains.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists +) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index d077958ce03..09a5d84c64f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,10 +10,9 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: - ctypedef enum concatenate_null_policy: - IGNORE "cudf::lists::concatenate_null_policy::IGNORE" - NULLIFY_OUTPUT_ROW \ - "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cpdef enum class concatenate_null_policy(int32_t): + IGNORE + NULLIFY_OUTPUT_ROW cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 41250037dcf..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, @@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index e7d006e6e2e..10c1c26e24e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) +from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy +from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from .column cimport Column from .scalar cimport Scalar @@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) -cpdef Column concatenate_list_elements(Column, bool dropna) +cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) -cpdef Column index_of(Column, ColumnOrScalar, bool) +cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) cpdef Column reverse(Column) @@ -37,16 +41,24 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) -cpdef Column sort_lists(Column, bool, null_order, bool stable = *) +cpdef Column sort_lists(Column, order, null_order, bool stable = *) -cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column difference_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column have_overlap( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column intersect_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column union_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) cpdef Column apply_boolean_mask(Column, Column) -cpdef Column distinct(Column, bool, bool) +cpdef Column distinct(Column, null_equality, nan_equality) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi new file mode 100644 index 00000000000..dff6c400638 --- /dev/null +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order + +class ConcatenateNullPolicy(IntEnum): + IGNORE = ... + NULLIFY_OUTPUT_ROW = ... + +class DuplicateFindOption(IntEnum): + FIND_FIRST = ... + FIND_LAST = ... + +def explode_outer(input: Table, explode_column_idx: int) -> Table: ... +def concatenate_rows(input: Table) -> Column: ... +def concatenate_list_elements( + input: Column, null_policy: ConcatenateNullPolicy +) -> Column: ... +def contains(input: Column, search_key: Column | Scalar) -> Column: ... +def contains_nulls(input: Column) -> Column: ... +def index_of( + input: Column, + search_key: Column | Scalar, + find_option: DuplicateFindOption, +) -> Column: ... +def reverse(input: Column) -> Column: ... +def segmented_gather(input: Column, gather_map_list: Column) -> Column: ... +def extract_list_element(input: Column, index: Column | int) -> Column: ... +def count_elements(input: Column) -> Column: ... +def sequences( + starts: Column, sizes: Column, steps: Column | None = None +) -> Column: ... 
+def sort_lists( + input: Column, + sort_order: Order, + na_position: NullOrder, + stable: bool = False, +) -> Column: ... +def difference_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def have_overlap( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def intersect_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def union_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... +def distinct( + input: Column, nulls_equal: NullEquality, nans_equal: NanEquality +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index ecaf62d6895..ccc56eaa520 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport ( ) from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType +from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint +from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "ConcatenateNullPolicy", + "DuplicateFindOption", + "apply_boolean_mask", + "concatenate_list_elements", + "concatenate_rows", + "contains", + "contains_nulls", + "count_elements", + "difference_distinct", + "distinct", + "explode_outer", + "extract_list_element", + "have_overlap", + "index_of", + "intersect_distinct", + "reverse", + "segmented_gather", + "sequences", + "sort_lists", + "union_distinct", +] cpdef Table explode_outer(Table input, size_type explode_column_idx): """Explode a column of lists into rows. @@ -97,7 +122,9 @@ cpdef Column concatenate_rows(Table input): return Column.from_libcudf(move(c_result)) -cpdef Column concatenate_list_elements(Column input, bool dropna): +cpdef Column concatenate_list_elements( + Column input, concatenate_null_policy null_policy +): """Concatenate multiple lists on the same row into a single list. For details, see :cpp:func:`concatenate_list_elements`. @@ -106,20 +133,14 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. + null_policy : ConcatenateNullPolicy + How to treat null list elements. 
Returns
    -------
    Column
        A new Column of concatenated list elements
    """
-    cdef concatenate_null_policy null_policy = (
-        concatenate_null_policy.IGNORE if dropna
-        else concatenate_null_policy.NULLIFY_OUTPUT_ROW
-    )
     cdef unique_ptr[column] c_result

     with nogil:
@@ -191,7 +212,9 @@ cpdef Column contains_nulls(Column input):
     return Column.from_libcudf(move(c_result))


-cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+cpdef Column index_of(
+    Column input, ColumnOrScalar search_key, duplicate_find_option find_option
+):
     """Create a column of index values indicating the position of a search
     key row within the corresponding list row in the lists column.

@@ -207,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
         The input column.
     search_key : Union[Column, Scalar]
         The search key.
-    find_first_option : bool
-        If true, index_of returns the first match.
-        Otherwise the last match is returned.
+    find_option : DuplicateFindOption
+        Which match to return if there are duplicates.

     Returns
     -------
@@ -220,11 +242,6 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
     """
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()
-    cdef cpp_contains.duplicate_find_option find_option = (
-        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
-        else cpp_contains.duplicate_find_option.FIND_LAST
-    )
-
     with nogil:
         c_result = cpp_contains.index_of(
             list_view.view(),
@@ -380,7 +397,7 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None):

 cpdef Column sort_lists(
     Column input,
-    bool ascending,
+    order sort_order,
     null_order na_position,
     bool stable = False
 ):
@@ -392,8 +409,8 @@ cpdef Column sort_lists(
     ----------
     input : Column
         The input column.
-    ascending : bool
-        If true, the sort order is ascending. Otherwise, the sort order is descending.
+    sort_order : Order
+        Sort order in the list.
     na_position : NullOrder
        If na_position equals NullOrder.FIRST, then the null values in the
        output column are placed first. Otherwise, they are placed after.
@@ -409,21 +426,17 @@ cpdef Column sort_lists(
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()

-    cdef order c_sort_order = (
-        order.ASCENDING if ascending else order.DESCENDING
-    )
-
     with nogil:
         if stable:
             c_result = cpp_stable_sort_lists(
                 list_view.view(),
-                c_sort_order,
+                sort_order,
                 na_position,
             )
         else:
             c_result = cpp_sort_lists(
                 list_view.view(),
-                c_sort_order,
+                sort_order,
                 na_position,
             )
     return Column.from_libcudf(move(c_result))
@@ -432,8 +445,8 @@ cpdef Column sort_lists(
 cpdef Column difference_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of the distinct elements of each list in ``lhs``
     that are not found in the corresponding list in ``rhs``.

     For details, see :cpp:func:`difference_distinct`.

     Parameters
     ----------
     lhs : Column
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -461,19 +473,12 @@ cpdef Column difference_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.difference_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -481,8 +486,8 @@ cpdef Column have_overlap(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Check if lists at each row of the given lists columns overlap.

@@ -494,11 +499,10 @@ cpdef Column have_overlap(
         The input lists column for one side.
     rhs : Column
         The input lists column for the other side.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -509,19 +513,12 @@ cpdef Column have_overlap(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.have_overlap(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -529,8 +526,8 @@ cpdef Column intersect_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements common to two input lists columns.

@@ -542,11 +539,10 @@ cpdef Column intersect_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.
Returns
     -------
@@ -557,19 +553,12 @@ cpdef Column intersect_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.intersect_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -577,8 +566,8 @@ cpdef Column union_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements found in
     either of two input lists columns.

@@ -591,11 +580,10 @@ cpdef Column union_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether nulls are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether nans are considered equal.

     Returns
     -------
@@ -606,19 +594,12 @@ cpdef Column union_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()

-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.union_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
@@ -651,7 +632,7 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
     return Column.from_libcudf(move(c_result))


-cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
+cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal):
     """Create a new list column without duplicate elements in each list.

     For details, see :cpp:func:`distinct`.

@@ -660,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
     ----------
     input : Column
         The input column.
-    nulls_equal : bool
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality
+        Whether nulls are considered equal.
+    nans_equal : NanEquality
+        Whether nans are considered equal.
Returns ------- @@ -674,17 +654,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_distinct( list_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi new file mode 100644 index 00000000000..b18eb01f8a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def merge( + tables_to_merge: list[Table], + key_cols: list[int], + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 61a21aafdb2..c051cdc0c66 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table +__all__ = ["merge"] cpdef Table merge ( list tables_to_merge, diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi new file mode 100644 index 00000000000..1a6d96a0822 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.device_buffer import DeviceBuffer + +from pylibcudf.column import Column +from pylibcudf.types import MaskState + +def copy_bitmask(col: Column) -> DeviceBuffer: ... +def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... +def create_null_mask( + size: int, state: MaskState = MaskState.UNINITIALIZED +) -> DeviceBuffer: ... +def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 74180951562..adc264e9af6 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -14,6 +14,13 @@ from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +__all__ = [ + "bitmask_allocation_size_bytes", + "bitmask_and", + "bitmask_or", + "copy_bitmask", + "create_null_mask", +] cdef DeviceBuffer buffer_to_python(device_buffer buf): return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi new file mode 100644 index 00000000000..ca39aa16d7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class BPEMergePairs: + def __init__(self, merge_pairs: Column): ... + +def byte_pair_encoding( + input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None +) -> Column: ... 
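A sketch of how the stubbed BPE entry points fit together. The two merge pairs are made-up placeholders (real tables come from a trained BPE merges file), and `plc.interop.from_arrow` is assumed for column construction.

```python
import pyarrow as pa
import pylibcudf as plc

# Hypothetical merge-pair table; each row is a space-separated pair to merge.
pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
    plc.interop.from_arrow(pa.array(["e s", "e n"]))
)
strings = plc.interop.from_arrow(pa.array(["test sentence"]))
encoded = plc.nvtext.byte_pair_encode.byte_pair_encoding(strings, pairs)
```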
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 76caad276d4..7565b21084f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.scalar cimport Scalar +__all__ = ["BPEMergePairs", "byte_pair_encoding"] cdef class BPEMergePairs: """The table of merge pairs for the BPE encoder. @@ -27,6 +28,8 @@ cdef class BPEMergePairs: with nogil: self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + __hash__ = None + cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi new file mode 100644 index 00000000000..85bbbb880ee --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def edit_distance(input: Column, targets: Column) -> Column: ... +def edit_distance_matrix(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index dcacb2e1267..eceeaff24e3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance_matrix as cpp_edit_distance_matrix, ) +__all__ = ["edit_distance", "edit_distance_matrix"] cpdef Column edit_distance(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi new file mode 100644 index 00000000000..2757518379d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def generate_ngrams( + input: Column, ngrams: int, separator: Scalar +) -> Column: ... +def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 09859d09e9e..521bc0ef4a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -14,6 +14,11 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = [ + "generate_ngrams", + "generate_character_ngrams", + "hash_character_ngrams", +] cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi new file mode 100644 index 00000000000..18263c5c8fd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ... 
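A small sketch of the `jaccard_index` stub above, under the same `interop.from_arrow` assumption; the strings are illustrative.

```python
import pyarrow as pa
import pylibcudf as plc

s1 = plc.interop.from_arrow(pa.array(["the fuzzy dog", "little piggy"]))
s2 = plc.interop.from_arrow(pa.array(["the fuzzy cat", "bigger piggy"]))

# Jaccard similarity computed over 5-character substrings of each row pair.
similarity = plc.nvtext.jaccard.jaccard_index(s1, s2, 5)
```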
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
index 3d8669865d9..90cace088f7 100644
--- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.jaccard cimport (
 )
 from pylibcudf.libcudf.types cimport size_type

+__all__ = ["jaccard_index"]

 cpdef Column jaccard_index(Column input1, Column input2, size_type width):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 97e8c9dc83c..6b544282f44 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar:

 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)

+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)

+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column word_minhash(Column input, Column seeds)

 cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
new file mode 100644
index 00000000000..a2d9b6364f7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def minhash(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def minhash_permuted(
+    input: Column, seed: int, a: Column, b: Column, width: int
+) -> Column: ...
+def minhash64(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def minhash64_permuted(
+    input: Column, seed: int, a: Column, b: Column, width: int
+) -> Column: ...
+def word_minhash(input: Column, seeds: Column) -> Column: ...
+def word_minhash64(input: Column, seeds: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index f1e012e60e5..5448cc6de9b 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    minhash64_permuted as cpp_minhash64_permuted,
+    minhash_permuted as cpp_minhash_permuted,
     word_minhash as cpp_word_minhash,
     word_minhash64 as cpp_word_minhash64,
 )
@@ -16,7 +18,16 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar

 from cython.operator import dereference
+import warnings

+__all__ = [
+    "minhash",
+    "minhash64",
+    "minhash64_permuted",
+    "minhash_permuted",
+    "word_minhash",
+    "word_minhash64",
+]

 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     """
@@ -40,6 +49,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result

     if not isinstance(seeds, (Column, Scalar)):
@@ -55,6 +70,50 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     return Column.from_libcudf(move(c_result))


+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint32_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for substrings.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     """
     Returns the minhash values for each string per seed.
@@ -77,6 +136,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result

     if not isinstance(seeds, (Column, Scalar)):
@@ -92,6 +157,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     return Column.from_libcudf(move(c_result))


+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash64_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint64_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for substrings.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash64_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column word_minhash(Column input, Column seeds):
     """
     Returns the minhash values for each row of strings per seed.
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
new file mode 100644
index 00000000000..224640ed44d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def ngrams_tokenize(
+    input: Column, ngrams: int, delimiter: Scalar, separator: Scalar
+) -> Column: ...
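Since the deprecation warnings in minhash.pyx above point users at the new `minhash_permuted` signature, a usage sketch may help. The seed and the `a`/`b` parameter columns (uint32 for the 32-bit variant) are illustrative, and `plc.interop.from_arrow` is assumed for construction.

```python
import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["minhash example string"]))
# Hypothetical permutation parameters; one minhash value is produced per (a, b) pair.
a = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.uint32()))
b = plc.interop.from_arrow(pa.array([4, 5, 6], type=pa.uint32()))

# One list of three minhash values per input row, seed 42, 5-character substrings.
hashes = plc.nvtext.minhash.minhash_permuted(strings, 42, a, b, 5)
```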
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx index 8a1854c5f0d..771c7c019fc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["ngrams_tokenize"] cpdef Column ngrams_tokenize( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi new file mode 100644 index 00000000000..1d90a5a8960 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def normalize_spaces(input: Column) -> Column: ... +def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index 637d900b659..b259ccaefa6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.normalize cimport ( normalize_spaces as cpp_normalize_spaces, ) +__all__ = ["normalize_characters", "normalize_spaces"] cpdef Column normalize_spaces(Column input): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi new file mode 100644 index 00000000000..1f1ac72ce7c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace_tokens( + input: Column, + targets: Column, + replacements: Column, + delimiter: Scalar | None = None, +) -> Column: ... +def filter_tokens( + input: Column, + min_token_length: int, + replacement: Scalar | None = None, + delimiter: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index b65348ce14d..a27592fb434 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["filter_tokens", "replace_tokens"] cpdef Column replace_tokens( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi new file mode 100644 index 00000000000..d6ba1d189bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def is_letter( + input: Column, check_vowels: bool, indices: Column | int +) -> Column: ... +def porter_stemmer_measure(input: Column) -> Column: ... 
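A short sketch of the `replace_tokens` stub above, under the usual `interop.from_arrow` assumption; with no delimiter scalar supplied, whitespace delimits tokens.

```python
import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["hello world", "goodbye world"]))
targets = plc.interop.from_arrow(pa.array(["world"]))
replacements = plc.interop.from_arrow(pa.array(["cudf"]))

# Replace every whitespace-delimited token matching a target with its replacement.
out = plc.nvtext.replace.replace_tokens(strings, targets, replacements)
```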
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index 854d1053624..c9e4f1274e4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext.stemmer cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["is_letter", "porter_stemmer_measure"] cpdef Column is_letter( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi new file mode 100644 index 00000000000..f6618e296b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class HashedVocabulary: + def __init__(self, hash_file: str): ... + +def subword_tokenize( + input: Column, + vocabulary_table: HashedVocabulary, + max_sequence_length: int, + stride: int, + do_lower_case: bool, + do_truncate: bool, +) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx index 04643d3bd84..14fb6f5fe1e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( tokenizer_result as cpp_tokenizer_result, ) +__all__ = ["HashedVocabulary", "subword_tokenize"] cdef class HashedVocabulary: """The vocabulary data for use with the subword_tokenize function. @@ -24,6 +25,8 @@ cdef class HashedVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + __hash__ = None + cpdef tuple[Column, Column, Column] subword_tokenize( Column input, HashedVocabulary vocabulary_table, diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi new file mode 100644 index 00000000000..b9aa2393514 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class TokenizeVocabulary: + def __init__(self, vocab: Column): ... + +def tokenize_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def tokenize_column(input: Column, delimiters: Column) -> Column: ... +def count_tokens_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def count_tokens_column(input: Column, delimiters: Column) -> Column: ... +def character_tokenize(input: Column) -> Column: ... +def detokenize( + input: Column, row_indices: Column, separator: Scalar | None = None +) -> Column: ... +def tokenize_with_vocabulary( + input: Column, + vocabulary: TokenizeVocabulary, + delimiter: Scalar, + default_id: int = -1, +) -> Column: ... 
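A sketch of the vocabulary-based tokenizer stubbed above; the two-word vocabulary is illustrative, and `plc.interop.from_arrow` is assumed for building the columns and the delimiter scalar.

```python
import pyarrow as pa
import pylibcudf as plc

vocab = plc.nvtext.tokenize.TokenizeVocabulary(
    plc.interop.from_arrow(pa.array(["hello", "world"]))
)
strings = plc.interop.from_arrow(pa.array(["hello there world"]))
delimiter = plc.interop.from_arrow(pa.scalar(" "))

# Tokens not found in the vocabulary map to default_id (-1 here).
ids = plc.nvtext.tokenize.tokenize_with_vocabulary(
    strings, vocab, delimiter, default_id=-1
)
```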
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index ec02e8ebf4e..43d426489b4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -20,6 +20,16 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = [ + "TokenizeVocabulary", + "character_tokenize", + "count_tokens_column", + "count_tokens_scalar", + "detokenize", + "tokenize_column", + "tokenize_scalar", + "tokenize_with_vocabulary", +] cdef class TokenizeVocabulary: """The Vocabulary object to be used with ``tokenize_with_vocabulary``. @@ -31,6 +41,8 @@ cdef class TokenizeVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary(c_vocab)) + __hash__ = None + cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): """ Returns a single column of strings by tokenizing the input diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi new file mode 100644 index 00000000000..48a2ade23f1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def hash_partition( + input: Table, columns_to_hash: list[int], num_partitions: int +) -> tuple[Table, list[int]]: ... +def partition( + t: Table, partition_map: Column, num_partitions: int +) -> tuple[Table, list[int]]: ... +def round_robin_partition( + input: Table, num_partitions: int, start_partition: int = 0 +) -> tuple[Table, list[int]]: ... diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 3cff4843735..1dacabceb06 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "hash_partition", + "partition", + "round_robin_partition", +] cpdef tuple[Table, list] hash_partition( Table input, diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi new file mode 100644 index 00000000000..dca6eed013a --- /dev/null +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import Interpolation, NullOrder, Order, Sorted + +def quantile( + input: Column, + q: Sequence[float], + interp: Interpolation = Interpolation.LINEAR, + ordered_indices: Column | None = None, + exact: bool = True, +) -> Column: ... +def quantiles( + input: Table, + q: Sequence[float], + interp: Interpolation = Interpolation.NEAREST, + is_input_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, +) -> Table: ... 
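A minimal sketch of the `quantile` stub above. It operates on presorted data unless `ordered_indices` is supplied, so the input here is already sorted; values are illustrative and `interop.from_arrow` is assumed.

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.types import Interpolation

col = plc.interop.from_arrow(pa.array([1.0, 2.0, 3.0, 4.0]))

# Median and 90th percentile with linear interpolation between rows.
q = plc.quantiles.quantile(col, [0.5, 0.9], Interpolation.LINEAR)
```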
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 7d92b598bd0..634218586ac 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -17,6 +17,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation +__all__ = ["quantile", "quantiles"] cpdef Column quantile( Column input, diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi new file mode 100644 index 00000000000..a09949b7b30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class ScanType(IntEnum): + INCLUSIVE = ... + EXCLUSIVE = ... + +def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... +def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... +def minmax(col: Column) -> tuple[Scalar, Scalar]: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index d9ec3a9bdc4..1d6ffd9de10 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -16,6 +16,7 @@ from .types cimport DataType from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +__all__ = ["ScanType", "minmax", "reduce", "scan"] cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """Perform a reduction on a column diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi new file mode 100644 index 00000000000..eed7a2a6c52 --- /dev/null +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class ReplacePolicy(IntEnum): + PRECEDING = ... + FOLLOWING = ... + +def replace_nulls( + source_column: Column, replacement: Column | Scalar | ReplacePolicy +) -> Column: ... +def find_and_replace_all( + source_column: Column, + values_to_replace: Column, + replacement_values: Column, +) -> Column: ... +def clamp( + source_column: Column, + lo: Scalar, + hi: Scalar, + lo_replace: Scalar | None = None, + hi_replace: Scalar | None = None, +) -> Column: ... +def normalize_nans_and_zeros( + source_column: Column, inplace: bool = False +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index f77eba7ace5..51be2b29277 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar +__all__ = [ + "ReplacePolicy", + "clamp", + "find_and_replace_all", + "normalize_nans_and_zeros", + "replace_nulls", +] + cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """Replace nulls in source_column. diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi new file mode 100644 index 00000000000..d8d0ffcc3e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def interleave_columns(source_table: Table) -> Column: ... 
+def tile(source_table: Table, count: int) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index 6540b5198ab..bdc212a1985 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table +__all__ = ["interleave_columns", "tile"] cpdef Column interleave_columns(Table source_table): """Interleave columns of a table into a single column. diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi new file mode 100644 index 00000000000..ca0111e01ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column + +def rolling_window[WindowType: (Column, int)]( + source: Column, + preceding_window: WindowType, + following_window: WindowType, + min_periods: int, + agg: Aggregation, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 4fd0b005431..11acf57ccf4 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column +__all__ = ["rolling_window"] cpdef Column rolling_window( Column source, diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi new file mode 100644 index 00000000000..410cf5de586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class RoundingMethod(IntEnum): + HALF_UP = ... + HALF_EVEN = ... + +def round( + source: Column, + decimal_places: int = 0, + round_method: RoundingMethod = RoundingMethod.HALF_UP, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 689363e652d..09e5a9cc3bc 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from .column cimport Column +__all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi new file mode 100644 index 00000000000..0b72b10ef86 --- /dev/null +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class Scalar: + def type(self) -> DataType: ... + def is_valid(self) -> bool: ... + @staticmethod + def empty_like(column: Column) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index d4888a62ad1..1ac014e891e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -11,6 +11,8 @@ from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType +__all__ = ["Scalar"] + # The DeviceMemoryResource attribute could be released prematurely # by the gc if the Scalar is in a reference cycle. Removing the tp_clear @@ -37,6 +39,8 @@ cdef class Scalar: # DeviceScalar. 
raise ValueError("Scalar should be constructed with a factory") + __hash__ = None + cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi new file mode 100644 index 00000000000..7f292b129b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def lower_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def upper_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def contains(haystack: Column, needles: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 1a870248046..50353fcd0cc 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table +__all__ = ["contains", "lower_bound", "upper_bound"] cpdef Column lower_bound( Table haystack, diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi new file mode 100644 index 00000000000..5255d869a4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import RankMethod +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order + +def sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def stable_sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def rank( + input_view: Column, + method: RankMethod, + column_order: Order, + null_handling: NullPolicy, + null_precedence: NullOrder, + percentage: bool, +) -> Column: ... +def is_sorted( + tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] +) -> bool: ... +def segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... 
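A small sketch of the sorting stubs above; each key column gets one `Order` and one `NullOrder` entry (values illustrative, `interop.from_arrow` assumed).

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.types import NullOrder, Order

tbl = plc.Table([plc.interop.from_arrow(pa.array([3, 1, None, 2]))])

# Gather map of row indices that would sort the table, nulls last.
indices = plc.sorting.sorted_order(tbl, [Order.ASCENDING], [NullOrder.AFTER])
# Or materialize the sorted table directly.
sorted_tbl = plc.sorting.sort(tbl, [Order.ASCENDING], [NullOrder.AFTER])
```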
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index fc40f03e1fd..fb29ef8c571 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table +__all__ = [ + "is_sorted", + "rank", + "segmented_sort_by_key", + "sort", + "sort_by_key", + "sorted_order", + "stable_segmented_sort_by_key", + "stable_sort", + "stable_sort_by_key", + "stable_sorted_order", +] cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a4f39792f0c..a20a23e2e58 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi new file mode 100644 index 00000000000..99cade48309 --- /dev/null +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy + +class DuplicateKeepOption(IntEnum): + KEEP_ANY = ... + KEEP_FIRST = ... + KEEP_LAST = ... + KEEP_NONE = ... + +def drop_nulls( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def drop_nans( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... +def unique( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, +) -> Table: ... +def distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def distinct_indices( + input: Table, + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Column: ... +def stable_distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def unique_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... +def distinct_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... 
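A sketch of `distinct` from the stub above; `KEEP_ANY` keeps an arbitrary row per duplicate group, and nulls compare equal here (values illustrative, `interop.from_arrow` assumed).

```python
import pyarrow as pa
import pylibcudf as plc
from pylibcudf.stream_compaction import DuplicateKeepOption
from pylibcudf.types import NanEquality, NullEquality

tbl = plc.Table([plc.interop.from_arrow(pa.array([1, 1, 2, None]))])

# Keep one row per distinct value of key column 0.
deduped = plc.stream_compaction.distinct(
    tbl,
    [0],
    DuplicateKeepOption.KEEP_ANY,
    NullEquality.EQUAL,
    NanEquality.ALL_EQUAL,
)
```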
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 2145398a191..6e403ca1b07 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \ from .column cimport Column from .table cimport Table +__all__ = [ + "DuplicateKeepOption", + "apply_boolean_mask", + "distinct", + "distinct_count", + "distinct_indices", + "drop_nans", + "drop_nulls", + "stable_distinct", + "unique", + "unique_count", +] cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index fa7294c7dbd..67054f0b447 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -28,6 +28,7 @@ from .side_type import SideType __all__ = [ + "SideType", "attributes", "capitalize", "case", @@ -46,9 +47,8 @@ "replace", "replace_re", "slice", - "strip", "split", - "SideType", + "strip", "translate", "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi new file mode 100644 index 00000000000..7fd5c9773d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def count_characters(source_strings: Column) -> Column: ... +def count_bytes(source_strings: Column) -> Column: ... +def code_points(source_strings: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 8e46a32835d..f1eb09b4965 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport attributes as cpp_attributes +__all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters(Column source_strings): """ diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi new file mode 100644 index 00000000000..5c6689418e2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.char_types import StringCharacterTypes + +def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ... +def title( + input: Column, + sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, +) -> Column: ... +def is_title(input: Column) -> Column: ... 
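To illustrate the capitalize stub above, a short sketch (with the default delimiters=None, only the first character of each string is capitalized; data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["hello world", None]))
    out = plc.strings.capitalize.capitalize(col)
    print(plc.interop.to_arrow(out))  # "Hello world", null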
diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 06b991c3cf1..a54480b8e4a 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference +__all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi new file mode 100644 index 00000000000..4e50db4d1da --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def to_lower(input: Column) -> Column: ... +def to_upper(input: Column) -> Column: ... +def swapcase(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 9e6cd7717d3..d0e054bef72 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport case as cpp_case +__all__ = ["swapcase", "to_lower", "to_upper"] cpdef Column to_lower(Column input): cdef unique_ptr[column] c_result diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi new file mode 100644 index 00000000000..daa36cbb68d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class StringCharacterTypes(IntEnum): + DECIMAL = ... + NUMERIC = ... + DIGIT = ... + ALPHA = ... + SPACE = ... + UPPER = ... + LOWER = ... + ALPHANUM = ... + CASE_TYPES = ... + ALL_TYPES = ... + +def all_characters_of_type( + source_strings: Column, + types: StringCharacterTypes, + verify_types: StringCharacterTypes, +) -> Column: ... +def filter_characters_of_type( + source_strings: Column, + types_to_remove: StringCharacterTypes, + replacement: Scalar, + types_to_keep: StringCharacterTypes, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index cb04efe5e8f..0af4a1f9c37 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -12,6 +12,11 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint +__all__ = [ + "StringCharacterTypes", + "all_characters_of_type", + "filter_characters_of_type", +] cpdef Column all_characters_of_type( Column source_strings, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi new file mode 100644 index 00000000000..3094b20f141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class SeparatorOnNulls(IntEnum): + YES = ... + NO = ... + +class OutputIfEmptyList(IntEnum): + EMPTY_STRING = ... + NULL_ELEMENT = ... 
+ +def concatenate( + strings_columns: Table, + separator: Column | Scalar, + narep: Scalar | None = None, + col_narep: Scalar | None = None, + separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, +) -> Column: ... +def join_strings( + input: Column, separator: Scalar, narep: Scalar +) -> Column: ... +def join_list_elements( + lists_strings_column: Column, + separator: Column | Scalar, + separator_narep: Scalar, + string_narep: Scalar, + separate_nulls: SeparatorOnNulls, + empty_list_policy: OutputIfEmptyList, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index f17d5265ab4..dc1e72c799b 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -17,6 +17,13 @@ from pylibcudf.libcudf.strings.combine import \ from pylibcudf.libcudf.strings.combine import \ separator_on_nulls as SeparatorOnNulls # no-cython-lint +__all__ = [ + "OutputIfEmptyList", + "SeparatorOnNulls", + "concatenate", + "join_list_elements", + "join_strings", +] cpdef Column concatenate( Table strings_columns, diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi new file mode 100644 index 00000000000..1f0620383b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram + +def contains_re(input: Column, prog: RegexProgram) -> Column: ... +def count_re(input: Column, prog: RegexProgram) -> Column: ... +def matches_re(input: Column, prog: RegexProgram) -> Column: ... +def like( + input: Column, + pattern: Column | Scalar, + escape_character: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index d4b1130241d..7b4c53ed853 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index aa27a7c8929..08b5034456e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -10,3 +10,15 @@ convert_lists, convert_urls, ) + +__all__ = [ + "convert_booleans", + "convert_datetime", + "convert_durations", + "convert_fixed_point", + "convert_floats", + "convert_integers", + "convert_ipv4", + "convert_lists", + "convert_urls", +] diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi new file mode 100644 index 00000000000..77c09242e9a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def from_booleans( + booleans: Column, true_string: Scalar, false_string: Scalar +) -> Column: ... 
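A sketch of the round trip the convert_booleans declarations above describe, using string scalars as the true/false markers (values illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.convert import convert_booleans

    strings = plc.interop.from_arrow(pa.array(["yes", "no", "yes"]))
    true_s = plc.interop.from_arrow(pa.scalar("yes"))
    false_s = plc.interop.from_arrow(pa.scalar("no"))

    # Strings equal to the true marker become True, everything else False.
    bools = convert_booleans.to_booleans(strings, true_s)
    back = convert_booleans.from_booleans(bools, true_s, false_s)
    print(plc.interop.to_arrow(back))  # "yes", "no", "yes"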
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index dc12b291b11..1899a3b27cc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -12,6 +12,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["from_booleans", "to_booleans"] cpdef Column to_booleans(Column input, Scalar true_string): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi new file mode 100644 index 00000000000..c6857169765 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_timestamps( + input: Column, timestamp_type: DataType, format: str +) -> Column: ... +def from_timestamps( + timestamps: Column, format: str, input_strings_names: Column +) -> Column: ... +def is_timestamp(input: Column, format: str) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index 0ee60812e00..f1cd684166c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi new file mode 100644 index 00000000000..a5787a5fe49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_durations( + input: Column, duration_type: DataType, format: str +) -> Column: ... +def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 31980ace418..a9654afd00a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi new file mode 100644 index 00000000000..1192d3dfcd6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_fixed_point(input: Column, output_type: DataType) -> Column: ... +def from_fixed_point(input: Column) -> Column: ... +def is_fixed_point( + input: Column, decimal_type: DataType | None = None +) -> Column: ... 
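The convert_datetime stub above pairs a strptime-style format string with an explicit timestamp type; a minimal sketch (format and values illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.convert import convert_datetime

    col = plc.interop.from_arrow(pa.array(["2020-01-31", "2020-02-29"]))
    ts = convert_datetime.to_timestamps(
        col,
        plc.types.DataType(plc.types.TypeId.TIMESTAMP_SECONDS),
        "%Y-%m-%d",
    )
    print(plc.interop.to_arrow(ts))  # timestamp[s] values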
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 962a47dfadf..00cbc822f36 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -9,6 +9,8 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType, type_id +__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] + cpdef Column to_fixed_point(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi new file mode 100644 index 00000000000..ddf4042e10d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_floats(strings: Column, output_type: DataType) -> Column: ... +def from_floats(floats: Column) -> Column: ... +def is_float(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 1296f4f9db5..b5199aac577 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = ["from_floats", "is_float", "to_floats"] cpdef Column to_floats(Column strings, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi new file mode 100644 index 00000000000..b96226fba90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_integers(input: Column, output_type: DataType) -> Column: ... +def from_integers(integers: Column) -> Column: ... +def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... +def hex_to_integers(input: Column, output_type: DataType) -> Column: ... +def is_hex(input: Column) -> Column: ... +def integers_to_hex(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index 5558683a502..12984e15ce9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -9,6 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = [ + "from_integers", + "hex_to_integers", + "integers_to_hex", + "is_hex", + "is_integer", + "to_integers" +] cpdef Column to_integers(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi new file mode 100644 index 00000000000..b017b32598c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def ipv4_to_integers(input: Column) -> Column: ... 
+def integers_to_ipv4(integers: Column) -> Column: ... +def is_ipv4(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 834781f95f3..e7c6aae4fa8 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 +__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] cpdef Column ipv4_to_integers(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi new file mode 100644 index 00000000000..6ab3a4183e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def format_list_column( + input: Column, + na_rep: Scalar | None = None, + separators: Column | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index cbfe5f5aa8b..518f72f6644 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -17,6 +17,7 @@ from pylibcudf.types cimport type_id from cython.operator import dereference +__all__ = ["format_list_column"] cpdef Column format_list_column( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi new file mode 100644 index 00000000000..49b8468957c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def url_encode(input: Column) -> Column: ... +def url_decode(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index 82f8a75f1d9..bd5e23bca43 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls +__all__ = ["url_decode", "url_encode"] cpdef Column url_encode(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi new file mode 100644 index 00000000000..4354bd3072d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def extract(input: Column, prog: RegexProgram) -> Table: ... +def extract_all_record(input: Column, prog: RegexProgram) -> Column: ... 
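A usage sketch for the extract stub above: each regex capture group becomes one column of the returned Table, with nulls for rows that do not match (pattern and data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a1", "b2", "cc"]))
    prog = plc.strings.regex_program.RegexProgram.create(
        "([a-z])([0-9])", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    tbl = plc.strings.extract.extract(col, prog)
    for group_col in tbl.columns():
        print(plc.interop.to_arrow(group_col))
    # "a", "b", null  then  "1", "2", null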
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index b56eccc8287..0ce70666e92 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table +__all__ = ["extract", "extract_all_record"] cpdef Table extract(Column input, RegexProgram prog): """ diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi new file mode 100644 index 00000000000..3d04a9c3161 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def find( + input: Column, target: Column | Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def rfind( + input: Column, target: Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def contains(input: Column, target: Column | Scalar) -> Column: ... +def starts_with(input: Column, target: Column | Scalar) -> Column: ... +def ends_with(input: Column, target: Column | Scalar) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 6fc6dca24fd..f0af339ff08 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -10,6 +10,7 @@ from cython.operator import dereference from pylibcudf.libcudf.scalar.scalar cimport string_scalar +__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"] cpdef Column find( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi new file mode 100644 index 00000000000..3d46fd2fa6d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def find_multiple(input: Column, targets: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 672aa606bd0..c9ce734b4be 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple +__all__ = ["find_multiple"] cpdef Column find_multiple(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi new file mode 100644 index 00000000000..77e38581d22 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram + +def find_re(input: Column, pattern: RegexProgram) -> Column: ... +def findall(input: Column, pattern: RegexProgram) -> Column: ... 
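The find stub above accepts either a column or a scalar target; a minimal sketch with a scalar (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["apple", "banana", None]))
    target = plc.interop.from_arrow(pa.scalar("an"))
    # Returns a BOOL8 column; null inputs stay null.
    print(plc.interop.to_arrow(plc.strings.find.contains(col, target)))
    # False, True, null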
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 89fa4302824..23c84675a16 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport findall as cpp_findall from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["findall", "find_re"] cpdef Column findall(Column input, RegexProgram pattern): """ diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi new file mode 100644 index 00000000000..a991935e6e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.side_type import SideType + +def pad( + input: Column, width: int, side: SideType, fill_char: str +) -> Column: ... +def zfill(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index f6950eecf60..0e349a7be47 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport padding as cpp_padding from pylibcudf.libcudf.strings.side_type cimport side_type +__all__ = ["pad", "zfill"] cpdef Column pad(Column input, size_type width, side_type side, str fill_char): """ diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi new file mode 100644 index 00000000000..c551cebf181 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class RegexFlags(IntEnum): + DEFAULT = ... + MULTILINE = ... + DOTALL = ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index ce3b6b10a42..65b504e0dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -2,3 +2,5 @@ from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint + +__all__ = ["RegexFlags"] diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi new file mode 100644 index 00000000000..9abd6fa7802 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.strings.regex_flags import RegexFlags + +class RegexProgram: + def __init__(self): ... + @staticmethod + def create(pattern: str, flags: RegexFlags) -> RegexProgram: ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index 91f585cd637..46bfde074d2 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_flags cimport regex_flags +__all__ = ["RegexProgram"] cdef class RegexProgram: """Regex program class. 
@@ -24,6 +25,8 @@ cdef class RegexProgram: def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") + __hash__ = None + @staticmethod def create(str pattern, int flags): """Create a program from a pattern. diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi new file mode 100644 index 00000000000..93a46b71caa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index fb2bb13c666..a497b1f438e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +__all__ = ["repeat_strings"] cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): """ diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi new file mode 100644 index 00000000000..64df09ef7e8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace( + input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1 +) -> Column: ... +def replace_multiple( + input: Column, target: Column, repl: Column, maxrepl: int = -1 +) -> Column: ... +def replace_slice( + input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1 +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 2b94f5e3fee..3ba6c1b5530 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["replace", "replace_multiple", "replace_slice"] cpdef Column replace( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi new file mode 100644 index 00000000000..056bafbf7ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import overload + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_flags import RegexFlags +from pylibcudf.strings.regex_program import RegexProgram + +@overload +def replace_re( + input: Column, + pattern: RegexProgram, + replacement: Scalar, + max_replace_count: int = -1, +) -> Column: ... +@overload +def replace_re( + input: Column, + patterns: list[str], + replacement: Column, + max_replace_count: int = -1, + flags: RegexFlags = RegexFlags.DEFAULT, +) -> Column: ... +def replace_with_backrefs( + input: Column, prog: RegexProgram, replacement: str +) -> Column: ... 
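The two replace_re overloads declared above distinguish a single compiled program with a scalar replacement from a list of patterns with a column of replacements; a sketch of the first form (pattern and data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a1b2", "c3"]))
    prog = plc.strings.regex_program.RegexProgram.create(
        "[0-9]", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    repl = plc.interop.from_arrow(pa.scalar("#"))
    out = plc.strings.replace_re.replace_re(col, prog, repl)
    print(plc.interop.to_arrow(out))  # "a#b#", "c#"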
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index ccc33fd4425..bdabc779ddf 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -16,6 +16,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["replace_re", "replace_with_backrefs"] cpdef Column replace_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi new file mode 100644 index 00000000000..532edd60077 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class SideType(IntEnum): + LEFT = ... + RIGHT = ... + BOTH = ... diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index cf0c770cc11..87db4206a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,3 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint + +__all__ = ["SideType"] diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi new file mode 100644 index 00000000000..7bf9a7cb8c6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def slice_strings( + input: Column, + start: Column | Scalar | None = None, + stop: Column | Scalar | None = None, + step: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 70d10cab36c..d32de7c50e0 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["slice_strings"] cpdef Column slice_strings( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py index 2033e5e275b..db2a597882e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import partition, split + +__all__ = ["partition", "split"] diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi new file mode 100644 index 00000000000..f19a463bd7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def partition(input: Column, delimiter: Scalar | None = None) -> Table: ... +def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ... 
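A sketch of the partition stub above: each string splits on the first occurrence of the delimiter into a three-column table of (head, delimiter, tail), with empty strings where the delimiter is absent (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc
    from pylibcudf.strings.split import partition

    col = plc.interop.from_arrow(pa.array(["a-b-c", "xyz"]))
    delim = plc.interop.from_arrow(pa.scalar("-"))
    tbl = partition.partition(col, delim)
    for part in tbl.columns():
        print(plc.interop.to_arrow(part))
    # ["a", "xyz"], ["-", ""], ["b-c", ""]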
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 0fb4f186c41..75537ea46d3 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -13,6 +13,7 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = ["partition", "rpartition"] cpdef Table partition(Column input, Scalar delimiter=None): """ diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi new file mode 100644 index 00000000000..3ccf0bc2a01 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def split( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def rsplit( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def split_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def rsplit_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def split_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... +def rsplit_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index e3827f6645e..90087f996f0 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -13,6 +13,16 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = [ + "rsplit", + "rsplit_re", + "rsplit_record", + "rsplit_record_re", + "split", + "split_re", + "split_record", + "split_record_re", +] cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): """ diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi new file mode 100644 index 00000000000..680355fc88f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.side_type import SideType + +def strip( + input: Column, + side: SideType = SideType.BOTH, + to_strip: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 429a23c3cdf..805d959891b 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +__all__ = ["strip"] cpdef Column strip( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi new file mode 100644 index 00000000000..7158b6eb05c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class FilterType(IntEnum): + KEEP = ... + REMOVE = ... + +def translate( + input: Column, chars_table: Mapping[int | str, int | str] +) -> Column: ... +def filter_characters( + input: Column, + characters_to_filter: Mapping[int | str, int | str], + keep_characters: FilterType, + replacement: Scalar, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index d85da8e6cdd..ba1e8dc5d27 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -14,6 +14,7 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.translate import \ filter_type as FilterType # no-cython-lint +__all__ = ["FilterType", "filter_characters", "translate"] cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): """ diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi new file mode 100644 index 00000000000..5658f279197 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def wrap(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 2ced250f837..b696eb48e47 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type +__all__ = ["wrap"] cpdef Column wrap(Column input, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi new file mode 100644 index 00000000000..5aef7e009c8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/table.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class Table: + def __init__(self, column: list[Column]): ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def columns(self) -> list[Column]: ... diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index d0d6f2343d0..0c1e88a927c 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column +__all__ = ["Table"] cdef class Table: """A list of columns of the same size. @@ -24,6 +25,8 @@ cdef class Table: raise ValueError("All columns must be pylibcudf Column objects") self._columns = columns + __hash__ = None + cdef table_view view(self) nogil: """Generate a libcudf table_view to pass to libcudf algorithms. diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py new file mode 100644 index 00000000000..f69e940e34e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize( + "source_func", + [ + "make_source", + "make_source_from_file", + ], +) +@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()]) +def test_multibyte_split(source_func, options, tmp_path): + data = "x::y::z" + func = getattr(plc.io.text, source_func) + if source_func == "make_source": + source = func(data) + elif source_func == "make_source_from_file": + fle = tmp_path / "fle.txt" + fle.write_text(data) + source = func(str(fle)) + result = plc.io.text.multibyte_split(source, "::", options) + expected = pa.array(["x::", "y::", "z"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index bbb08e8b95a..a33122221f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -541,13 +541,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_AND, pa.compute.and_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_AND, - pa.compute.and_, - ), ( "int64", "int64", @@ -562,13 +555,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_OR, pa.compute.or_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_OR, - pa.compute.or_, - ), ( "int64", "int64", diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py new file mode 100644 index 00000000000..91c7e42a0a0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_filling.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def pa_col(): + return pa.array([2, 3, 5, 7, 11]) + + +@pytest.fixture +def pa_table(): + pa_col = pa.array([1, 2, 3]) + return pa.table([pa_col], names=["a"]) + + +def test_fill(pa_col): + result = plc.filling.fill( + plc.interop.from_arrow(pa_col), + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_fill_in_place(pa_col): + result = plc.interop.from_arrow(pa_col) + plc.filling.fill_in_place( + result, + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_sequence(): + size = 5 + init_scalar = plc.interop.from_arrow(pa.scalar(10)) + step_scalar = plc.interop.from_arrow(pa.scalar(2)) + result = plc.filling.sequence( + size, + init_scalar, + step_scalar, + ) + expect = pa.array([10, 12, 14, 16, 18]) + assert_column_eq(result, expect) + + +def test_repeat_with_count_int(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = 2 + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"]) + assert_table_eq(expect, result) + + +def test_repeat_with_count_column(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = plc.interop.from_arrow(pa.array([1, 2, 3])) + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"]) + assert_table_eq(expect, result) + + +def test_calendrical_month_sequence(): + n = 5 + init_date = datetime(2020, 1, 31) + init = plc.interop.from_arrow( + pa.scalar(init_date, 
type=pa.timestamp("ms")) + ) + months = 1 + result = plc.filling.calendrical_month_sequence(n, init, months) + expected_dates = [ + datetime(2020, 1, 31), + datetime(2020, 2, 29), + datetime(2020, 3, 31), + datetime(2020, 4, 30), + datetime(2020, 5, 31), + ] + expect = pa.array(expected_dates, type=pa.timestamp("ms")) + assert_column_eq(result, expect) diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index beacfc63ce5..946d583d1cc 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -6,8 +6,12 @@ import pylibcudf as plc -@pytest.mark.parametrize("left_inclusive", [True, False]) -@pytest.mark.parametrize("right_inclusive", [True, False]) +@pytest.mark.parametrize( + "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) +@pytest.mark.parametrize( + "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) def test_label_bins(left_inclusive, right_inclusive): in_col = plc.interop.from_arrow(pa.array([1, 2, 3])) left_edges = plc.interop.from_arrow(pa.array([0, 5])) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index f3ef555f11d..8c1229c2a04 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -62,12 +62,12 @@ def test_concatenate_rows(test_data): [ ( [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], - False, + plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, [[1, 2, 3, 4, 5], None], ), ( [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], - True, + plc.lists.ConcatenateNullPolicy.IGNORE, [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], ), ], @@ -138,7 +138,9 @@ def test_index_of_scalar(list_column, scalar): plc_column = plc.interop.from_arrow(arr) plc_scalar = plc.interop.from_arrow(scalar) - res = plc.lists.index_of(plc_column, plc_scalar, True) + res = plc.lists.index_of( + plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array([1, -1, -1, -1], type=pa.int32()) @@ -150,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column): arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) - res = plc.lists.index_of(plc_column1, plc_column2, True) + res = plc.lists.index_of( + plc_column1, plc_column2, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array(search_key_column[1], type=pa.int32()) @@ -227,39 +231,34 @@ def test_sequences(): @pytest.mark.parametrize( - "ascending,na_position,expected", + "order,na_position,expected", [ ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.BEFORE, [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], ), ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.AFTER, [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], ), ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.BEFORE, [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], ), ( - False, - plc.types.NullOrder.AFTER, - [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], - ), - ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.AFTER, [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], ), ], ) -def test_sort_lists(lists_column, ascending, na_position, expected): +def test_sort_lists(lists_column, order, na_position, expected): plc_column = plc.interop.from_arrow(pa.array(lists_column)) - res = plc.lists.sort_lists(plc_column, ascending, na_position, False) - 
res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + res = plc.lists.sort_lists(plc_column, order, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, order, na_position, True) expect = pa.array(expected) @@ -272,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected): [ ( plc.lists.difference_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, 5]], ), ( plc.lists.difference_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, None, 5]], ), ( plc.lists.have_overlap, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [True, False, None, True], ), ( plc.lists.have_overlap, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [True, False, None, False], ), ( plc.lists.intersect_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 1, 2], [], None, [None]], ), ( plc.lists.intersect_distinct, - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[1, 2], [], None, [None]], ), ( plc.lists.union_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [ [np.nan, 2, 1, 3], [1, 2, 3, 4, 5], @@ -319,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected): ), ( plc.lists.union_distinct, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 2, 1, np.nan, 3], [1, 2, 3, 4, 5], @@ -352,20 +351,24 @@ def test_set_operations( @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( - False, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], ), ( - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], ), ( - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,15 +21,19 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py 
b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index f461657281a..e85cd1cc443 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -8,7 +8,7 @@ import pylibcudf as plc -@pytest.fixture() +@pytest.fixture def str_data(): pa_data = pa.array(["A", None]) return pa_data, plc.interop.from_arrow(pa_data) diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi new file mode 100644 index 00000000000..fdb31a262cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/traits.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.types import DataType + +def is_relationally_comparable(typ: DataType) -> bool: ... +def is_equality_comparable(typ: DataType) -> bool: ... +def is_numeric(typ: DataType) -> bool: ... +def is_numeric_not_bool(typ: DataType) -> bool: ... +def is_index_type(typ: DataType) -> bool: ... +def is_unsigned(typ: DataType) -> bool: ... +def is_integral(typ: DataType) -> bool: ... +def is_integral_not_bool(typ: DataType) -> bool: ... +def is_floating_point(typ: DataType) -> bool: ... +def is_boolean(typ: DataType) -> bool: ... +def is_timestamp(typ: DataType) -> bool: ... +def is_fixed_point(typ: DataType) -> bool: ... +def is_duration(typ: DataType) -> bool: ... +def is_chrono(typ: DataType) -> bool: ... +def is_dictionary(typ: DataType) -> bool: ... +def is_fixed_width(typ: DataType) -> bool: ... +def is_compound(typ: DataType) -> bool: ... +def is_nested(typ: DataType) -> bool: ... +def is_bit_castable(source: DataType, target: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 9c52e0ac1ab..3cf0a3a4b3b 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType +__all__ = [ + "is_bit_castable", + "is_boolean", + "is_chrono", + "is_compound", + "is_dictionary", + "is_duration", + "is_equality_comparable", + "is_fixed_point", + "is_fixed_width", + "is_floating_point", + "is_index_type", + "is_integral", + "is_integral_not_bool", + "is_nested", + "is_numeric", + "is_numeric_not_bool", + "is_relationally_comparable", + "is_timestamp", + "is_unsigned", +] cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi new file mode 100644 index 00000000000..5cbd2e635f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table +from pylibcudf.types import DataType + +def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... +def compute_column(input: Table, expr: Expression) -> Column: ... +def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... +def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def transform( + input: Column, unary_udf: str, output_type: DataType, is_ptx: bool +) -> Column: ... +def encode(input: Table) -> tuple[Table, Column]: ... +def one_hot_encode(input: Column, categories: Column) -> Table: ... 
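As a sketch of the transform stubs above, one_hot_encode maps a column against a column of categories and returns one BOOL8 indicator column per category (data illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([1, 2, 1]))
    cats = plc.interop.from_arrow(pa.array([1, 2]))
    encoded = plc.transform.one_hot_encode(col, cats)
    for indicator in encoded.columns():
        print(plc.interop.to_arrow(indicator))
    # [True, False, True]  then  [False, True, False]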
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index e8d95cadb0c..9700bcff221 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -18,6 +18,15 @@ from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
 from .utils cimport int_to_bitmask_ptr
 
+__all__ = [
+    "bools_to_mask",
+    "compute_column",
+    "encode",
+    "mask_to_bools",
+    "nans_to_nulls",
+    "one_hot_encode",
+    "transform",
+]
 
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     """Create a null mask preserving existing nulls and converting nans to null.
diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi
new file mode 100644
index 00000000000..a84ab8a60ea
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pyi
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.table import Table
+
+def transpose(input_table: Table) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx
index a24f937ced3..5eb3e58cebc 100644
--- a/python/pylibcudf/pylibcudf/transpose.pyx
+++ b/python/pylibcudf/pylibcudf/transpose.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["transpose"]
 cpdef Table transpose(Table input_table):
     """Transpose a Table.
diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi
new file mode 100644
index 00000000000..c91a95414bd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/types.pyi
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+from typing import Final
+
+class Interpolation(IntEnum):
+    LINEAR = ...
+    LOWER = ...
+    HIGHER = ...
+    MIDPOINT = ...
+    NEAREST = ...
+
+class MaskState(IntEnum):
+    UNALLOCATED = ...
+    UNINITIALIZED = ...
+    ALL_VALID = ...
+    ALL_NULL = ...
+
+class NanEquality(IntEnum):
+    ALL_EQUAL = ...
+    UNEQUAL = ...
+
+class NanPolicy(IntEnum):
+    NAN_IS_NULL = ...
+    NAN_IS_VALID = ...
+
+class NullEquality(IntEnum):
+    EQUAL = ...
+    UNEQUAL = ...
+
+class NullOrder(IntEnum):
+    AFTER = ...
+    BEFORE = ...
+
+class NullPolicy(IntEnum):
+    EXCLUDE = ...
+    INCLUDE = ...
+
+class Order(IntEnum):
+    ASCENDING = ...
+    DESCENDING = ...
+
+class Sorted(IntEnum):
+    NO = ...
+    YES = ...
+
+class TypeId(IntEnum):
+    EMPTY = ...
+    INT8 = ...
+    INT16 = ...
+    INT32 = ...
+    INT64 = ...
+    UINT8 = ...
+    UINT16 = ...
+    UINT32 = ...
+    UINT64 = ...
+    FLOAT32 = ...
+    FLOAT64 = ...
+    BOOL8 = ...
+    TIMESTAMP_DAYS = ...
+    TIMESTAMP_SECONDS = ...
+    TIMESTAMP_MILLISECONDS = ...
+    TIMESTAMP_MICROSECONDS = ...
+    TIMESTAMP_NANOSECONDS = ...
+    DURATION_DAYS = ...
+    DURATION_SECONDS = ...
+    DURATION_MILLISECONDS = ...
+    DURATION_MICROSECONDS = ...
+    DURATION_NANOSECONDS = ...
+    DICTIONARY32 = ...
+    STRING = ...
+    LIST = ...
+    DECIMAL32 = ...
+    DECIMAL64 = ...
+    DECIMAL128 = ...
+    STRUCT = ...
+    NUM_TYPE_IDS = ...
+
+class DataType:
+    def __init__(self, type_id: TypeId, scale: int = 0): ...
+    def id(self) -> TypeId: ...
+    def scale(self) -> int: ...
+
+def size_of(t: DataType) -> int: ...
+
+SIZE_TYPE: Final[DataType]
+SIZE_TYPE_ID: Final[TypeId]
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
index a0c31f994a3..afa1b56f38a 100644
--- a/python/pylibcudf/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, i
 from pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
 from pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
 
+__all__ = [
+    "DataType",
+    "Interpolation",
+    "MaskState",
+    "NanEquality",
+    "NanPolicy",
+    "NullEquality",
+    "NullOrder",
+    "NullPolicy",
+    "Order",
+    "SIZE_TYPE",
+    "SIZE_TYPE_ID",
+    "Sorted",
+    "TypeId",
+    "size_of"
+]
 
 cdef class DataType:
     """Indicator for the logical data type of an element in a column.
diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi
new file mode 100644
index 00000000000..7aa23b618f4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/unary.pyi
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+class UnaryOperator(IntEnum):
+    SIN = ...
+    COS = ...
+    TAN = ...
+    ARCSIN = ...
+    ARCCOS = ...
+    ARCTAN = ...
+    SINH = ...
+    COSH = ...
+    TANH = ...
+    ARCSINH = ...
+    ARCCOSH = ...
+    ARCTANH = ...
+    EXP = ...
+    LOG = ...
+    SQRT = ...
+    CBRT = ...
+    CEIL = ...
+    FLOOR = ...
+    ABS = ...
+    RINT = ...
+    BIT_INVERT = ...
+    NOT = ...
+
+def unary_operation(input: Column, op: UnaryOperator) -> Column: ...
+def is_null(input: Column) -> Column: ...
+def is_valid(input: Column) -> Column: ...
+def cast(input: Column, data_type: DataType) -> Column: ...
+def is_nan(input: Column) -> Column: ...
+def is_not_nan(input: Column) -> Column: ...
+def is_supported_cast(from_: DataType, to: DataType) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx
index 53e8c382b5e..b738ab53d1b 100644
--- a/python/pylibcudf/pylibcudf/unary.pyx
+++ b/python/pylibcudf/pylibcudf/unary.pyx
@@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \
 
 from .column cimport Column
 from .types cimport DataType
 
+__all__ = [
+    "UnaryOperator",
+    "cast",
+    "is_nan",
+    "is_not_nan",
+    "is_null",
+    "is_supported_cast",
+    "is_valid",
+    "unary_operation",
+]
 cpdef Column unary_operation(Column input, unary_operator op):
     """Perform a unary operation on a column.
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index b2cec80f484..e83db47830c 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -22,8 +22,8 @@ dependencies = [
     "libcudf==24.12.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
-    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
+    "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'",
     "rmm==24.12.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -56,18 +56,43 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 [tool.ruff]
 extend = "../../pyproject.toml"
 
+[tool.ruff.lint]
+extend-select = [
+    "TCH", # flake8-type-checking
+    "TID", # flake8-tidy-imports
+    "PT", # flake8-pytest-style
+]
+extend-ignore = [
+    "PT011", # pytest.raises(...) is too broad
+]
+
+[tool.ruff.lint.flake8-pytest-style]
+# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
+fixture-parentheses = false
+mark-parentheses = false
+parametrize-names-type = "csv"
+parametrize-values-type = "list"
+parametrize-values-row-type = "tuple"
+
 [tool.ruff.lint.isort]
 combine-as-imports = true
-known-first-party = ["cudf"]
-section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+known-first-party = ["pylibcudf"]
+section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"]
 
 [tool.ruff.lint.isort.sections]
-dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm"]
 
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module
 addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"
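
A closing aside (illustrative only, not part of the patch): the new flake8-pytest-style settings are what motivate the fixture change at the top of this section. With fixture-parentheses = false, ruff's PT001 rule flags @pytest.fixture() in favor of the bare decorator. A short sketch of the style the linter now enforces:

    # Sketch of the fixture style under the new ruff configuration.
    import pytest

    @pytest.fixture  # compliant: bare decorator when no arguments are passed
    def str_data():
        return ["A", None]

    # @pytest.fixture()                # would be flagged by PT001
    # @pytest.fixture(scope="module")  # parameterized fixtures remain valid

Relatedly, PT011 (which the config ignores above) would otherwise require pytest.raises(...) to use a narrow exception type or a match= pattern.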