diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index c19bb68986f..8190b5d0297 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -33,3 +33,4 @@ ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" ENV HISTFILE="/home/coder/.cache/._bash_history" +ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67c451fbd6e..6942ef0009d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -108,3 +108,21 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + trigger-pandas-tests: + if: inputs.build_type == 'nightly' + needs: wheel-build-cudf + runs-on: ubuntu-latest + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.sha }} + persist-credentials: false + - name: Trigger pandas-tests + env: + GH_TOKEN: ${{ github.token }} + run: | + gh workflow run pandas-tests.yaml \ + -f branch=${{ inputs.branch }} \ + -f sha=${{ inputs.sha }} \ + -f date=${{ inputs.date }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml new file mode 100644 index 00000000000..60544294809 --- /dev/null +++ b/.github/workflows/pandas-tests.yaml @@ -0,0 +1,27 @@ +name: Pandas Test Job + +on: + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +jobs: + pandas-tests: + # run the Pandas unit tests + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 345ccbea45b..f9d5976f1fe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -174,7 +174,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. @@ -182,7 +182,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@patch-1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 65aef37697e..170f45e23fd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -44,7 +44,6 @@ jobs: container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: - needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: @@ -125,14 +124,3 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - pandas-tests: - # run the Pandas unit tests - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 - with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - build_type: nightly - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dce92d7e613..757eaa44510 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh index 675e0c3981f..d1f9e0d1399 100755 --- a/ci/configure_cpp_static.sh +++ b/ci/configure_cpp_static.sh @@ -3,8 +3,6 @@ set -euo pipefail -rapids-configure-conda-channels - source rapids-date-string rapids-logger "Configure static cpp build" diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index cf80f383db4..f87a3a36fcc 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -8,14 +8,16 @@ # Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir GH_JOB_NAME="pandas-tests-diff / build" +RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Github job name: ${GH_JOB_NAME}" +rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" PY_VER="39" -MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json +MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json +PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json rapids-logger "Fetching latest available results from nightly" -aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt +aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt read -r COMPARE_ENV < s3_output.txt export COMPARE_ENV diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 1f70ca78c41..abde5e5d160 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -6,8 +6,8 @@ set -euo pipefail PANDAS_TESTS_BRANCH=${1} - -rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" +RAPIDS_FULL_VERSION=$(<./VERSION) +rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -27,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --dist worksteal \ --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1 +SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json # summarize the results and save them to artifacts: -python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json +python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME} RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} mkdir -p "${RAPIDS_ARTIFACTS_DIR}" -mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/ -rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}" +mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/ +rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 82d7104b0da..e629f8b633e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - pip - pre-commit -- protobuf>=4.21,<5 +- protobuf>=3.20,<5 - ptxcompiler - pyarrow==14.0.2.* - pydata-sphinx-theme!=0.14.2 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 0fd87e91745..f135a88cac2 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandoc - pip - pre-commit -- protobuf>=4.21,<5 +- protobuf>=3.20,<5 - pyarrow==14.0.2.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 7633fbb00a3..cd9237bd7cb 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} run: - - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} + - protobuf >=3.20,<5.0a0 - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.2dev0 diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b384f6d5674..571780888c0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -339,6 +339,11 @@ ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB) +# ################################################################################################## +# * decimal benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 9857aac4473..6df2cb44adc 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -324,10 +324,11 @@ struct random_value_fn()>> { distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, lower_bound, upper_bound)}, + scale{desc.scale} { } diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 31dc2673d70..68d3dc492f5 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -182,9 +182,17 @@ struct distribution_params -struct distribution_params()>> {}; +struct distribution_params()>> { + distribution_id id; + typename T::rep lower_bound; + typename T::rep upper_bound; + std::optional scale; +}; /** * @brief Returns a vector of types, corresponding to the input type or a type group. @@ -226,7 +234,7 @@ class data_profile { cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2}; distribution_params struct_dist_desc{ {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2}; - std::map> decimal_params; + std::map> decimal_params; double bool_probability_true = 0.5; std::optional null_probability = 0.01; @@ -300,16 +308,21 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { using rep = typename T::rep; auto it = decimal_params.find(cudf::type_to_id()); if (it == decimal_params.end()) { auto const range = default_range(); - return distribution_params{default_distribution_id(), range.first, range.second}; + auto const scale = std::optional{}; + return distribution_params{ + default_distribution_id(), range.first, range.second, scale}; } else { auto& desc = it->second; - return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; + return {desc.id, + static_cast(desc.lower_bound), + static_cast(desc.upper_bound), + desc.scale}; } } @@ -359,6 +372,23 @@ class data_profile { } } + // Users should pass integral values for bounds when setting the parameters for fixed-point. + // Otherwise the call with have no effect. + template , T>* = nullptr> + void set_distribution_params(Type_enum type_or_group, + distribution_id dist, + T lower_bound, + T upper_bound, + numeric::scale_type scale) + { + for (auto tid : get_type_or_group(static_cast(type_or_group))) { + decimal_params[tid] = { + dist, static_cast<__int128_t>(lower_bound), static_cast<__int128_t>(upper_bound), scale}; + } + } + template (), T>* = nullptr> void set_distribution_params(Type_enum type_or_group, distribution_id dist, diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp new file mode 100644 index 00000000000..a367036c494 --- /dev/null +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include + +// This benchmark compares the cost of converting decimal <--> floating point +template +void bench_cast_decimal(nvbench::state& state, nvbench::type_list) +{ + static constexpr bool is_input_floating = std::is_floating_point_v; + static constexpr bool is_output_floating = std::is_floating_point_v; + + static constexpr bool is_double = + std::is_same_v || std::is_same_v; + static constexpr bool is_32bit = + std::is_same_v || std::is_same_v; + static constexpr bool is_128bit = std::is_same_v || + std::is_same_v; + + // Skip floating --> floating and decimal --> decimal + if constexpr (is_input_floating == is_output_floating) { + state.skip("Meaningless conversion."); + return; + } + + // Skip float <--> dec128 + if constexpr (!is_double && is_128bit) { + state.skip("Ignoring float <--> dec128."); + return; + } + + // Get settings + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const exp_mode = state.get_int64("exp_range"); + + // Exponent range: Range size is 10^6 + // These probe the edges of the float and double ranges, as well as more common values + int const exp_min_array[] = {-307, -37, -14, -3, 8, 31, 301}; + int const exp_range_size = 6; + int const exp_min = exp_min_array[exp_mode]; + int const exp_max = exp_min + exp_range_size; + + // With exp range size of 6, decimal output (generated or casted-to) has 7 digits of precision + int const extra_digits_precision = 1; + + // Exclude end range of double from float test + if (!is_double && ((exp_mode == 0) || (exp_mode == 6))) { + state.skip("Range beyond end of float tests."); + return; + } + + // The current float <--> decimal conversion algorithm is limited + static constexpr bool is_64bit = !is_32bit && !is_128bit; + if (is_32bit && (exp_mode != 3)) { + state.skip("Decimal32 conversion only works up to scale factors of 10^9."); + return; + } + if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { + state.skip("Decimal64 conversion only works up to scale factors of 10^18."); + return; + } + if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { + state.skip("Decimal128 conversion only works up to scale factors of 10^38."); + return; + } + + // Type IDs + auto const input_id = cudf::type_to_id(); + auto const output_id = cudf::type_to_id(); + + // Create data profile and scale + auto const [output_scale, profile] = [&]() { + if constexpr (is_input_floating) { + // Range for generated floating point values + auto get_pow10 = [](auto exp10) { + return std::pow(static_cast(10), static_cast(exp10)); + }; + InputType const floating_range_min = get_pow10(exp_min); + InputType const floating_range_max = get_pow10(exp_max); + + // With exp range size of 6, output has 7 decimal digits of precision + auto const decimal_output_scale = exp_min - extra_digits_precision; + + // Input distribution + data_profile const profile = data_profile_builder().distribution( + input_id, distribution_id::NORMAL, floating_range_min, floating_range_max); + + return std::pair{decimal_output_scale, profile}; + + } else { // Generating decimals + + using decimal_rep_type = typename InputType::rep; + + // For exp range size 6 and precision 7, generates ints between 10 and 10^7, + // with scale factor of: exp_max - 7. This matches floating point generation. + int const digits_precision = exp_range_size + extra_digits_precision; + auto const decimal_input_scale = numeric::scale_type{exp_max - digits_precision}; + + // Range for generated integer values + auto get_pow10 = [](auto exp10) { + return numeric::detail::ipow(exp10); + }; + auto const decimal_range_min = get_pow10(digits_precision - exp_range_size); + auto const decimal_range_max = get_pow10(digits_precision); + + // Input distribution + data_profile const profile = data_profile_builder().distribution(input_id, + distribution_id::NORMAL, + decimal_range_min, + decimal_range_max, + decimal_input_scale); + + return std::pair{0, profile}; + } + }(); + + // Generate input data + auto const input_col = create_random_column(input_id, row_count{num_rows}, profile); + auto const input_view = input_col->view(); + + // Output type + auto const output_type = + !is_input_floating ? cudf::data_type(output_id) : cudf::data_type(output_id, output_scale); + + // Stream + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + // Run benchmark + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::cast(input_view, output_type); }); + + // Throughput statistics + state.add_element_count(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); +} + +// Data types +using data_types = + nvbench::type_list; + +NVBENCH_BENCH_TYPES(bench_cast_decimal, NVBENCH_TYPE_AXES(data_types, data_types)) + .set_name("decimal_floating_conversion") + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("exp_range", nvbench::range(0, 6)); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 884e5a2f368..025bff7d8f0 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,44 +14,19 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) - set(oneValueArgs VERSION FORK PINNED_TAG) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - # Only run if PKG_VERSION is < 0.5.0 - if(PKG_VERSION VERSION_LESS 0.5.0) - set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff") - set(patch_issues_to_ref - "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]" - ) - set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake") - set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log") - string(TIMESTAMP current_year "%Y" UTC) - configure_file( - ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY - ) - else() - message( - FATAL_ERROR - "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf." - ) - endif() - + # Currently we need to always build nanoarrow so we don't pickup a previous installed version + set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( - nanoarrow ${PKG_VERSION} + nanoarrow 0.5.0 GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git - GIT_TAG ${PKG_PINNED_TAG} - # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin - # to an actual tag. + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 11e73a8c85b45e3d49c8c541b4e1497a649fe03c GIT_SHALLOW FALSE - PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) endfunction() -find_and_configure_nanoarrow( - VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8 -) +find_and_configure_nanoarrow() diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff deleted file mode 100644 index 1262a38c0a4..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff +++ /dev/null @@ -1,184 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8714c70..6a9e505 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -49,7 +49,6 @@ else() - endif() - - option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF) --add_library(coverage_config INTERFACE) - - # Avoids a warning about timestamps on downloaded files (prefer new policy - # if available)) -@@ -59,6 +58,7 @@ endif() - - configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h) - -+include(GNUInstallDirs) - if(NANOARROW_BUNDLE) - # Combine all headers into amalgamation/nanoarrow.h in the build directory - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation) -@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE) - if(NANOARROW_BUILD_TESTS) - include_directories(${CMAKE_BINARY_DIR}/amalgamation) - add_library(nanoarrow ${NANOARROW_C_TEMP}) -+ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) -+ - target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") - endif() - -@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE) - else() - add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c - src/nanoarrow/array_stream.c src/nanoarrow/utils.c) -+ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) - - target_include_directories(nanoarrow - PUBLIC $ -- $) -+ $) - target_include_directories(nanoarrow - PUBLIC $ - ) -@@ -154,13 +157,49 @@ else() - endif() - endif() - -- install(TARGETS nanoarrow DESTINATION lib) -+ install(TARGETS nanoarrow -+ DESTINATION "${CMAKE_INSTALL_LIBDIR}" -+ EXPORT nanoarrow-exports) - install(DIRECTORY src/ -- DESTINATION include -+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" - FILES_MATCHING -- PATTERN "*.h") -+ PATTERN "*.h*") - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h -- DESTINATION include/nanoarrow) -+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow") -+ -+ # Generate package files for the build and install trees. -+ include(CMakePackageConfigHelpers) -+ -+ foreach(tree_type BUILD INSTALL) -+ if(tree_type STREQUAL "BUILD") -+ set(install_location ".") -+ else() -+ set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow") -+ endif() -+ -+ set(build_location "${PROJECT_BINARY_DIR}/${install_location}") -+ write_basic_package_version_file( -+ "${build_location}/nanoarrow-config-version.cmake" -+ VERSION ${nanoarrow_VERSION} -+ # After 1.0.0, we can use `SameMajorVersion` here. -+ COMPATIBILITY ExactVersion) -+ configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in" -+ "${build_location}/nanoarrow-config.cmake" -+ INSTALL_DESTINATION "${install_location}") -+ -+ if(tree_type STREQUAL "BUILD") -+ export(EXPORT nanoarrow-exports -+ FILE "${build_location}/nanoarrow-targets.cmake" -+ NAMESPACE nanoarrow::) -+ -+ else() -+ install(DIRECTORY "${build_location}/" DESTINATION "${install_location}") -+ install(EXPORT nanoarrow-exports -+ DESTINATION "${install_location}" -+ FILE "nanoarrow-targets.cmake" -+ NAMESPACE nanoarrow::) -+ endif() -+ endforeach() - endif() - - # Always build integration test if building tests -@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS) - src/nanoarrow/integration/c_data_integration.cc) - target_include_directories(nanoarrow_c_data_integration - PUBLIC $ -- $) -+ $) - target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json) - endif() - -@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) - src/nanoarrow/integration/c_data_integration_test.cc) - - if(NANOARROW_CODE_COVERAGE) -- target_compile_options(coverage_config INTERFACE -O0 -g --coverage) -- target_link_options(coverage_config INTERFACE --coverage) -- target_link_libraries(nanoarrow coverage_config) -+ target_compile_options(nanoarrow PUBLIC -O0 -g --coverage) -+ target_link_options(nanoarrow PUBLIC --coverage) - endif() - -- target_link_libraries(utils_test -- nanoarrow -- gtest_main -- ${NANOARROW_ARROW_TARGET} -- coverage_config) -- target_link_libraries(buffer_test nanoarrow gtest_main coverage_config) -- target_link_libraries(array_test -- nanoarrow -- gtest_main -- ${NANOARROW_ARROW_TARGET} -- coverage_config) -- target_link_libraries(schema_test -- nanoarrow -- gtest_main -- ${NANOARROW_ARROW_TARGET} -- coverage_config) -- target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config) -- target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config) -- target_link_libraries(nanoarrow_testing_test -- nanoarrow -- gtest_main -- nlohmann_json::nlohmann_json -- coverage_config) -+ target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) -+ target_link_libraries(buffer_test nanoarrow gtest_main) -+ target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) -+ target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) -+ target_link_libraries(array_stream_test nanoarrow gtest_main) -+ target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main) -+ target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main -+ nlohmann_json::nlohmann_json) - target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration - gtest_main) - -diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in -new file mode 100644 -index 0000000..021dc31 ---- /dev/null -+++ b/cmake/config.cmake.in -@@ -0,0 +1,28 @@ -+# Licensed to the Apache Software Foundation (ASF) under one -+# or more contributor license agreements. See the NOTICE file -+# distributed with this work for additional information -+# regarding copyright ownership. The ASF licenses this file -+# to you under the Apache License, Version 2.0 (the -+# "License"); you may not use this file except in compliance -+# with the License. You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, -+# software distributed under the License is distributed on an -+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -+# KIND, either express or implied. See the License for the -+# specific language governing permissions and limitations -+# under the License. -+ -+ -+@PACKAGE_INIT@ -+ -+cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@) -+ -+include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED) -+include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED) -+ -+set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") -+include(FindPackageHandleStandardArgs) -+find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE) diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh index ab934750f9e..bbf56cf1446 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,6 +102,9 @@ __device__ inline double stod(string_view const& d_str) ch = *in_ptr++; if (ch < '0' || ch > '9') break; exp_ten = (exp_ten * 10) + (int)(ch - '0'); + // Prevent integer overflow in exp_ten. 100,000,000 is the largest + // power of ten that can be multiplied by 10 without overflow. + if (exp_ten >= 100'000'000) { break; } } } } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index bc5c45d8980..9d40c657396 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -76,16 +76,16 @@ void print_tree(host_span input, tree_meta_t const& d_gpu_tree, rmm::cuda_stream_view stream) { - print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream), + print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream), "node_categories", to_cat); - print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream), + print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream), "parent_node_ids", to_int); print_vec( - cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int); - auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream); - auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream); + cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int); + auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream); + auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream); print_vec(node_range_begin, "node_range_begin", to_int); print_vec(node_range_end, "node_range_end", to_int); for (int i = 0; i < int(node_range_begin.size()); i++) { @@ -333,10 +333,11 @@ rmm::device_uvector get_values_column_indices(TreeDepthT const row_a * @param stream CUDA stream * @return Vector of strings */ -std::vector copy_strings_to_host(device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); @@ -371,12 +372,13 @@ std::vector copy_strings_to_host(device_span input, auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_std_vector_sync( + auto const h_chars = cudf::detail::make_std_vector_async( cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_std_vector_sync( + auto const h_offsets = cudf::detail::make_std_vector_async( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), stream); + stream.synchronize(); // build std::string vector from chars and offsets std::vector host_data; @@ -528,8 +530,9 @@ void make_device_json_column(device_span input, auto column_range_beg = cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host( + std::vector column_names = copy_strings_to_host_sync( input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + stream.synchronize(); // array of arrays column names if (is_array_of_arrays) { TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; @@ -537,6 +540,7 @@ void make_device_json_column(device_span input, get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); auto h_values_column_indices = cudf::detail::make_std_vector_async(values_column_indices, stream); + stream.synchronize(); std::transform(unique_col_ids.begin(), unique_col_ids.end(), column_names.begin(), @@ -609,7 +613,7 @@ void make_device_json_column(device_span input, std::vector is_str_column_all_nulls{}; if (is_enabled_mixed_types_as_string) { - is_str_column_all_nulls = cudf::detail::make_std_vector_async( + is_str_column_all_nulls = cudf::detail::make_std_vector_sync( is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index fa1de5f301d..83bf7fb0d73 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -924,7 +924,7 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, auto start = cur; - auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) { + auto init_rle = [s, lvl, level_bits](uint8_t const* cur, uint8_t const* end) { uint32_t const run = get_vlq32(cur, end); s->initial_rle_run[lvl] = run; if (!(run & 1)) { @@ -1160,7 +1160,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, int32_t units = 0; // Duration types are not included because no scaling is done when reading if (s->col.logical_type.has_value()) { - auto const& lt = s->col.logical_type.value(); + auto const& lt = *s->col.logical_type; if (lt.is_timestamp_millis()) { units = cudf::timestamp_ms::period::den; } else if (lt.is_timestamp_micros()) { @@ -1217,7 +1217,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } else if (data_type == INT32) { // check for smaller bitwidths if (s->col.logical_type.has_value()) { - auto const& lt = s->col.logical_type.value(); + auto const& lt = *s->col.logical_type; if (lt.type == LogicalType::INTEGER) { s->dtype_len = lt.bit_width() / 8; } else if (lt.is_time_millis()) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5a8d96975ce..fd8d4f8bd7f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1396,16 +1396,13 @@ void init_encoder_pages(hostdevice_2dvector& chunks, } /** - * @brief Encode a batch of pages. + * @brief Encode pages. * * @throws rmm::bad_alloc if there is insufficient space for temporary buffers * * @param chunks column chunk array * @param pages encoder pages array - * @param pages_in_batch number of pages in this batch - * @param first_page_in_batch first page in batch - * @param rowgroups_in_batch number of rowgroups in this batch - * @param first_rowgroup first rowgroup in batch + * @param num_rowgroups number of rowgroups * @param page_stats optional page-level statistics (nullptr if none) * @param chunk_stats optional chunk-level statistics (nullptr if none) * @param column_stats optional page-level statistics for column index (nullptr if none) @@ -1417,10 +1414,6 @@ void init_encoder_pages(hostdevice_2dvector& chunks, */ void encode_pages(hostdevice_2dvector& chunks, device_span pages, - uint32_t pages_in_batch, - uint32_t first_page_in_batch, - uint32_t rowgroups_in_batch, - uint32_t first_rowgroup, statistics_chunk const* page_stats, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, @@ -1430,14 +1423,12 @@ void encode_pages(hostdevice_2dvector& chunks, bool write_v2_headers, rmm::cuda_stream_view stream) { - auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch); + auto const num_pages = pages.size(); + auto pages_stats = (page_stats != nullptr) + ? device_span(page_stats, num_pages) + : device_span(); - auto batch_pages_stats = - (page_stats != nullptr) - ? device_span(page_stats + first_page_in_batch, pages_in_batch) - : device_span(); - - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1447,7 +1438,7 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { @@ -1480,25 +1471,23 @@ void encode_pages(hostdevice_2dvector& chunks, // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level - auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); - DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_res, batch_pages_stats, chunk_stats, stream); - GatherPages(d_chunks_in_batch.flat_view(), pages, stream); + auto d_chunks = chunks.device_view(); + DecideCompression(d_chunks.flat_view(), stream); + EncodePageHeaders(pages, comp_res, pages_stats, chunk_stats, stream); + GatherPages(d_chunks.flat_view(), pages, stream); // By now, the var_bytes has been calculated in InitPages, and the histograms in EncodePages. // EncodeColumnIndexes can encode the histograms in the ColumnIndex, and also sum up var_bytes // and the histograms for inclusion in the chunk's SizeStats. if (column_stats != nullptr) { - EncodeColumnIndexes(d_chunks_in_batch.flat_view(), - {column_stats, pages.size()}, - column_index_truncate_length, - stream); + EncodeColumnIndexes( + d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(), - d_chunks_in_batch.data(), - d_chunks_in_batch.flat_view().size_bytes(), + auto h_chunks = chunks.host_view(); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), + d_chunks.data(), + d_chunks.flat_view().size_bytes(), cudaMemcpyDefault, stream.value())); @@ -1959,33 +1948,23 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::fill_n(std::back_inserter(rg_to_part), num_rg_in_part[p], p); } - // Batch processing is no longer supported. - // This line disables batch processing (so batch size will no longer be limited at 1GB as before). - // TODO: All the relevant code will be removed in the follow-up work: - // https://github.com/rapidsai/cudf/issues/13440 - auto const max_bytes_in_batch = std::numeric_limits::max(); - - // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) - std::vector batch_list; - size_type num_pages = 0; - size_t max_uncomp_bfr_size = 0; - size_t max_comp_bfr_size = 0; - size_t max_chunk_bfr_size = 0; - size_type max_pages_in_batch = 0; - size_t bytes_in_batch = 0; - size_t comp_bytes_in_batch = 0; + // Initialize rowgroups to encode + size_type num_pages = 0; + size_t max_uncomp_bfr_size = 0; + size_t max_comp_bfr_size = 0; + size_t max_chunk_bfr_size = 0; + size_t column_index_bfr_size = 0; size_t def_histogram_bfr_size = 0; size_t rep_histogram_bfr_size = 0; - for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { - size_t rowgroup_size = 0; - size_t comp_rowgroup_size = 0; + size_t rowgroup_size = 0; + size_t comp_rowgroup_size = 0; + for (size_type r = 0; r <= num_rowgroups; r++) { if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { EncColumnChunk* ck = &chunks[r][i]; ck->first_page = num_pages; num_pages += ck->num_pages; - pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; comp_rowgroup_size += ck->compressed_size; max_chunk_bfr_size = @@ -2007,29 +1986,17 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } } - // TBD: We may want to also shorten the batch if we have enough pages (not just based on size) - if ((r == num_rowgroups) || - (groups_in_batch != 0 && bytes_in_batch + rowgroup_size > max_bytes_in_batch)) { - max_uncomp_bfr_size = std::max(max_uncomp_bfr_size, bytes_in_batch); - max_comp_bfr_size = std::max(max_comp_bfr_size, comp_bytes_in_batch); - max_pages_in_batch = std::max(max_pages_in_batch, pages_in_batch); - if (groups_in_batch != 0) { - batch_list.push_back(groups_in_batch); - groups_in_batch = 0; - } - bytes_in_batch = 0; - comp_bytes_in_batch = 0; - pages_in_batch = 0; + // write bfr sizes if this is the last rowgroup + if (r == num_rowgroups) { + max_uncomp_bfr_size = rowgroup_size; + max_comp_bfr_size = comp_rowgroup_size; } - bytes_in_batch += rowgroup_size; - comp_bytes_in_batch += comp_rowgroup_size; - groups_in_batch++; } // Clear compressed buffer size if compression has been turned off if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } - // Initialize data pointers in batch + // Initialize data pointers uint32_t const num_stats_bfr = (stats_granularity != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0; @@ -2055,10 +2022,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_i = static_cast(col_idx_bfr.data()); auto bfr_r = rep_level_histogram.data(); auto bfr_d = def_level_histogram.data(); - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { + if (num_rowgroups != 0) { auto bfr = static_cast(uncomp_bfr.data()); auto bfr_c = static_cast(comp_bfr.data()); - for (auto j = 0; j < batch_list[b]; j++, r++) { + for (auto r = 0; r < num_rowgroups; r++) { for (auto i = 0; i < num_columns; i++) { EncColumnChunk& ck = chunks[r][i]; ck.uncompressed_bfr = bfr; @@ -2108,22 +2075,11 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::optional comp_stats; if (collect_compression_statistics) { comp_stats = writer_compression_statistics{}; } - // Encode row groups in batches - for (auto b = 0, batch_r_start = 0; b < static_cast(batch_list.size()); b++) { - // Count pages in this batch - auto const rnext = batch_r_start + batch_list[b]; - auto const first_page_in_batch = chunks[batch_r_start][0].first_page; - auto const first_page_in_next_batch = - (rnext < num_rowgroups) ? chunks[rnext][0].first_page : num_pages; - auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch; - + // Encode row groups + if (num_rowgroups != 0) { encode_pages( chunks, {pages.data(), pages.size()}, - pages_in_batch, - first_page_in_batch, - batch_list[b], - batch_r_start, (stats_granularity == statistics_freq::STATISTICS_PAGE) ? page_stats.data() : nullptr, (stats_granularity != statistics_freq::STATISTICS_NONE) ? page_stats.data() + num_pages : nullptr, @@ -2152,7 +2108,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } - for (int r = batch_r_start; r < rnext; r++) { + for (int r = 0; r < num_rowgroups; r++) { int p = rg_to_part[r]; int global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = agg_meta->file(p).row_groups[global_r]; @@ -2192,7 +2148,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto h_def_ptr = h_def_histogram.data(); auto h_rep_ptr = h_rep_histogram.data(); - for (int r = batch_r_start; r < rnext; r++) { + for (int r = 0; r < num_rowgroups; r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = agg_meta->file(p).row_groups[global_r]; @@ -2239,8 +2195,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } } } - - batch_r_start = rnext; } auto bounce_buffer = @@ -2251,7 +2205,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::move(chunks), std::move(global_rowgroup_base), std::move(first_rg_in_part), - std::move(batch_list), std::move(rg_to_part), std::move(comp_stats), std::move(uncomp_bfr), @@ -2358,7 +2311,6 @@ void writer::impl::write(table_view const& input, std::vector co chunks, global_rowgroup_base, first_rg_in_part, - batch_list, rg_to_part, comp_stats, uncomp_bfr, // unused, but contains data for later write to sink @@ -2402,7 +2354,6 @@ void writer::impl::write(table_view const& input, std::vector co chunks, global_rowgroup_base, first_rg_in_part, - batch_list, rg_to_part, bounce_buffer); @@ -2417,18 +2368,17 @@ void writer::impl::write_parquet_data_to_sink( host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, - host_span batch_list, host_span rg_to_part, host_span bounce_buffer) { - _agg_meta = std::move(updated_agg_meta); - auto const num_columns = chunks.size().second; + _agg_meta = std::move(updated_agg_meta); + auto const num_rowgroups = chunks.size().first; + auto const num_columns = chunks.size().second; - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { - auto const rnext = r + batch_list[b]; + if (num_rowgroups != 0) { std::vector> write_tasks; - for (; r < rnext; r++) { + for (auto r = 0; r < static_cast(num_rowgroups); r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto& row_group = _agg_meta->file(p).row_groups[global_r]; @@ -2472,10 +2422,9 @@ void writer::impl::write_parquet_data_to_sink( auto const h_pages = cudf::detail::make_host_vector_sync(pages, _stream); // add column and offset indexes to metadata - for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { - auto const rnext = r + batch_list[b]; - auto curr_page_idx = chunks[r][0].first_page; - for (; r < rnext; r++) { + if (num_rowgroups != 0) { + auto curr_page_idx = chunks[0][0].first_page; + for (auto r = 0; r < static_cast(num_rowgroups); r++) { int const p = rg_to_part[r]; int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 2f6608b0ae7..3cbb7630fab 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -129,7 +129,6 @@ class writer::impl { * @param chunks Column chunks * @param global_rowgroup_base Numbers of rowgroups in each file/partition * @param first_rg_in_part The first rowgroup in each partition - * @param batch_list The batches of rowgroups to encode * @param rg_to_part A map from rowgroup to partition * @param[out] bounce_buffer Temporary host output buffer */ @@ -138,7 +137,6 @@ class writer::impl { host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, - host_span batch_list, host_span rg_to_part, host_span bounce_buffer); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 8d8930013cf..a7fd244f8a5 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,9 @@ #include #include +#include +#include +#include namespace cudf { namespace strings { @@ -110,23 +114,22 @@ struct convert_char_fn { * * This can be used in calls to make_strings_children. */ -struct upper_lower_fn { +struct base_upper_lower_fn { convert_char_fn converter; - column_device_view d_strings; size_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) const + base_upper_lower_fn(convert_char_fn converter) : converter(converter) {} + + __device__ inline void process_string(string_view d_str, size_type idx) const { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - size_type bytes = 0; - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const size = converter.process_character(*itr, d_buffer); + size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + for (auto itr = d_str.data(); itr < (d_str.data() + d_str.size_bytes()); ++itr) { + if (is_utf8_continuation_char(static_cast(*itr))) continue; + char_utf8 chr = 0; + to_char_utf8(itr, chr); + auto const size = converter.process_character(chr, d_buffer); if (d_buffer) { d_buffer += size; } else { @@ -137,45 +140,116 @@ struct upper_lower_fn { } }; +struct upper_lower_fn : public base_upper_lower_fn { + column_device_view d_strings; + + upper_lower_fn(convert_char_fn converter, column_device_view const& d_strings) + : base_upper_lower_fn{converter}, d_strings{d_strings} + { + } + + __device__ void operator()(size_type idx) const + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; + } + auto const d_str = d_strings.element(idx); + process_string(d_str, idx); + } +}; + +// Long strings are divided into smaller strings using this value as a guide. +// Generally strings are split into sub-blocks of bytes of this size but +// care is taken to not sub-block in the middle of a multi-byte character. +constexpr size_type LS_SUB_BLOCK_SIZE = 32; + /** - * @brief Count output bytes in warp-parallel threads + * @brief Produces sub-offsets for the chars in the given strings column + */ +struct sub_offset_fn { + char const* d_input_chars; + int64_t first_offset; + int64_t last_offset; + + __device__ int64_t operator()(int64_t idx) const + { + auto const end = d_input_chars + last_offset; + auto position = (idx + 1) * LS_SUB_BLOCK_SIZE; + auto begin = d_input_chars + first_offset + position; + while ((begin < end) && is_utf8_continuation_char(static_cast(*begin))) { + ++begin; + ++position; + } + return (begin < end) ? position + first_offset : last_offset; + } +}; + +/** + * @brief Specialized case conversion for long strings * - * This executes as one warp per string and just computes the output sizes. + * This is needed since the offset count can exceed size_type. + * Also, nulls are ignored since this purely builds the output chars. + * The d_offsets are only temporary to help address the sub-blocks. */ -struct count_bytes_fn { +struct upper_lower_ls_fn : public base_upper_lower_fn { convert_char_fn converter; - column_device_view d_strings; - size_type* d_offsets; + char const* d_input_chars; + int64_t* d_input_offsets; // includes column offset + upper_lower_ls_fn(convert_char_fn converter, char const* d_input_chars, int64_t* d_input_offsets) + : base_upper_lower_fn{converter}, d_input_chars{d_input_chars}, d_input_offsets{d_input_offsets} + { + } + + // idx is row index __device__ void operator()(size_type idx) const { - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - - // initialize the output for the atomicAdd - if (lane_idx == 0) { d_offsets[str_idx] = 0; } - __syncwarp(); - - if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); - auto const str_ptr = d_str.data(); - - size_type size = 0; - for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { - auto const chr = str_ptr[i]; - if (is_utf8_continuation_char(chr)) { continue; } - char_utf8 u8 = 0; - to_char_utf8(str_ptr + i, u8); - size += converter.process_character(u8); - } - // this is every so slightly faster than using the cub::warp_reduce - if (size > 0) { - cuda::atomic_ref ref{*(d_offsets + str_idx)}; - ref.fetch_add(size, cuda::std::memory_order_relaxed); - } + auto const offset = d_input_offsets[idx]; + auto const d_str = string_view{d_input_chars + offset, + static_cast(d_input_offsets[idx + 1] - offset)}; + process_string(d_str, idx); } }; +/** + * @brief Count output bytes in warp-parallel threads + * + * This executes as one warp per string and just computes the output sizes. + */ +CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter, + column_device_view d_strings, + size_type* d_sizes) +{ + auto idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize the output for the atomicAdd + if (lane_idx == 0) { d_sizes[str_idx] = 0; } + __syncwarp(); + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + auto const str_ptr = d_str.data(); + + size_type size = 0; + for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { + auto const chr = str_ptr[i]; + if (is_utf8_continuation_char(chr)) { continue; } + char_utf8 u8 = 0; + to_char_utf8(str_ptr + i, u8); + size += converter.process_character(u8); + } + // this is slightly faster than using the cub::warp_reduce + if (size > 0) { + cuda::atomic_ref ref{*(d_sizes + str_idx)}; + ref.fetch_add(size, cuda::std::memory_order_relaxed); + } +} + /** * @brief Special functor for processing ASCII-only data */ @@ -208,11 +282,18 @@ std::unique_ptr convert_case(strings_column_view const& input, auto const d_cases = get_character_cases_table(); auto const d_special = get_special_case_mapping_table(); + auto const first_offset = (input.offset() == 0) ? 0L + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special}; upper_lower_fn converter{ccfn, *d_strings}; // For smaller strings, use the regular string-parallel algorithm - if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto [offsets, chars] = cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), @@ -235,9 +316,8 @@ std::unique_ptr convert_case(strings_column_view const& input, [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0; if (!multi_byte_chars) { // optimization for ASCII-only case: copy the input column and inplace replace each character - auto result = std::make_unique(input.parent(), stream, mr); - auto d_chars = result->mutable_view().head(); - auto const chars_size = strings_column_view(result->view()).chars_size(stream); + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = result->mutable_view().head(); thrust::transform( rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); result->set_null_count(input.null_count()); @@ -245,30 +325,46 @@ std::unique_ptr convert_case(strings_column_view const& input, } // This will use a warp-parallel algorithm to compute the output sizes for each string - // and then uses the normal string parallel functor to build the output. - auto offsets = make_numeric_column( - data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets->mutable_view().data(); - - // first pass, compute output sizes // note: tried to use segmented-reduce approach instead here and it was consistently slower - count_bytes_fn counter{ccfn, *d_strings, d_offsets}; - auto const count_itr = thrust::make_counting_iterator(0); - thrust::for_each_n( - rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter); - - // convert sizes to offsets - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); - - rmm::device_uvector chars(bytes, stream, mr); - // second pass, write output - converter.d_offsets = d_offsets; - converter.d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); + auto [offsets, bytes] = [&] { + rmm::device_uvector sizes(input.size(), stream); + constexpr int block_size = 512; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + count_bytes_kernel<<>>( + ccfn, *d_strings, sizes.data()); + // convert sizes to offsets + return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr); + }(); + + // build sub-offsets + auto const input_chars = input.chars_begin(stream); + auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE; + auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); + { + rmm::device_uvector sub_offsets(sub_count, stream); + auto const count_itr = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy_nosync(stream), + count_itr, + count_itr + sub_count, + sub_offsets.data(), + sub_offset_fn{input_chars, first_offset, last_offset}); + + // merge them with input offsets + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + thrust::merge(rmm::exec_policy_nosync(stream), + input_offsets, + input_offsets + input.size() + 1, + sub_offsets.begin(), + sub_offsets.end(), + tmp_offsets.begin()); + } + + // run case conversion over the new sub-strings + auto const tmp_size = static_cast(tmp_offsets.size()) - 1; + upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()}; + auto chars = + std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr)); return make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index 1d82d785ae8..bb0e77a29d0 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -235,7 +235,7 @@ TEST_F(StringsCaseTest, LongStrings) { // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu cudf::test::strings_column_wrapper input{ - "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", + "abcdéfghijklmnopqrstuvwxyzABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ", "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"}; @@ -256,7 +256,8 @@ TEST_F(StringsCaseTest, LongStrings) results = cudf::strings::to_upper(view); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front())); + view = cudf::strings_column_view(cudf::slice(input, {1, 3}).front()); + results = cudf::strings::to_upper(view); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front()); } diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index f668c384787..9fa1a3325b4 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - struct StringsConvertTest : public cudf::test::BaseFixture {}; TEST_F(StringsConvertTest, IsFloat) @@ -89,7 +88,7 @@ TEST_F(StringsConvertTest, ToFloats32) h_expected.begin(), h_expected.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, FromFloats32) @@ -118,38 +117,41 @@ TEST_F(StringsConvertTest, FromFloats32) h_expected.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, ToFloats64) { // clang-format off std::vector h_strings{ - "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "1234", "", "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", "-Inf", "-INFINITY", "1.0", "1.7976931348623157e+308", "1.7976931348623157e-307", // subnormal numbers: v--- smallest double v--- result is 0 - "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324" }; + "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324", + // another very small number + "9.299999257686047e-0005603333574677677" }; // clang-format on - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = cudf::test::iterators::null_at(1); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); std::vector h_expected; std::for_each(h_strings.begin(), h_strings.end(), [&](char const* str) { - h_expected.push_back(str ? std::atof(str) : 0); + h_expected.push_back(std::atof(str)); }); auto strings_view = cudf::strings_column_view(strings); auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT64}); cudf::test::fixed_width_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + h_expected.begin(), h_expected.end(), validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::is_float(strings_view); + cudf::test::fixed_width_column_wrapper is_expected( + {1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, is_expected); } TEST_F(StringsConvertTest, FromFloats64) @@ -178,7 +180,7 @@ TEST_F(StringsConvertTest, FromFloats64) h_expected.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsConvertTest, ZeroSizeStringsColumnFloat) diff --git a/dependencies.yaml b/dependencies.yaml index 5bb555df818..8cd4c798c38 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -286,7 +286,7 @@ dependencies: - output_types: conda packages: - &rmm_conda rmm==24.6.* - - &protobuf protobuf>=4.21,<5 + - &protobuf protobuf>=3.20,<5 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 549d91b771a..4aaaa8a93df 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`. ## Result ordering -By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF -do *not* guarantee output ordering. -Compare the results obtained from Pandas and cuDF below: +In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide +certain guarantees about the order of rows in the result returned. In a Pandas +`join`, the order of join keys is (depending on the particular style of join +being performed) either preserved or sorted lexicographically by default. +`groupby` sorts the group keys, and preserves the order of rows within each +group. In some cases, disabling this option in Pandas can yield better +performance. + +By contrast, cuDF's default behavior is to return rows in a +non-deterministic order to maximize performance. Compare the results +obtained from Pandas and cuDF below: ```{code} python >>> import cupy as cp @@ -114,13 +122,16 @@ a 4 342.000000 ``` -To match Pandas behavior, you must explicitly pass `sort=True` -or enable the `mode.pandas_compatible` option when trying to -match Pandas behavior with `sort=False`: +In most cases, the rows of a DataFrame are accessed by index labels +rather than by position, so the order in which rows are returned +doesn't matter. However, if you require that results be returned in a +predictable (sorted) order, you can pass the `sort=True` option +explicitly or enable the `mode.pandas_compatible` option when trying +to match Pandas behavior with `sort=False`: ```{code} python ->>> df.to_pandas().groupby("a", sort=True).mean().head() - b +>>> df.groupby("a", sort=True).mean().head() + b a 0 70.000000 1 356.333333 diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 81d15cf95b4..c2b7cb7ca3d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -44,3 +44,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) link_to_pyarrow_headers(pylibcudf_interop) + +add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 48c23a9dd4c..5adefa5fd93 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ __all__ = [ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8ccb0ecc341..89f874f5fa5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -17,6 +17,7 @@ search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..3a2a9e1e7eb --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources case.pyx) +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd new file mode 100644 index 00000000000..225d566fe06 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input) +cpdef Column to_upper(Column input) +cpdef Column swapcase(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx new file mode 100644 index 00000000000..69910fd8c50 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.strings cimport case as cpp_case +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_lower(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column to_upper(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_upper(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column swapcase(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.swapcase(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx index 09af1178946..38f242a67d6 100644 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ b/python/cudf/cudf/_lib/strings/case.pyx @@ -1,48 +1,34 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.strings.case cimport ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) + +from cudf._lib.pylibcudf.strings import case @acquire_spill_lock() def to_upper(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_upper(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_upper( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def to_lower(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_lower(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_lower( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def swapcase(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_swapcase(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.swapcase( + source_strings.to_pylibcudf(mode='read') + ) + ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index e4620ee5bc4..e3e73035046 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -614,12 +614,6 @@ def children(self) -> Tuple[NumericalColumn]: def categories(self) -> ColumnBase: return self.dtype.categories._values - @categories.setter - def categories(self, value): - self._dtype = CategoricalDtype( - categories=value, ordered=self.dtype.ordered - ) - @property def codes(self) -> NumericalColumn: if self._codes is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 809bdb4e6d1..01842b5f0a9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1808,12 +1808,6 @@ def all(self, axis=0, skipna=True, **kwargs): b False dtype: bool - .. pandas-compat:: - **DataFrame.all, Series.all** - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - .. pandas-compat:: **DataFrame.all, Series.all** @@ -1867,12 +1861,6 @@ def any(self, axis=0, skipna=True, **kwargs): b True dtype: bool - .. pandas-compat:: - **DataFrame.any, Series.any** - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - .. pandas-compat:: **DataFrame.any, Series.any** diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 945e546af1a..dd4924676f3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1767,13 +1767,23 @@ def transform(self, function): -------- agg """ + if not (isinstance(function, str) or callable(function)): + raise TypeError( + "Aggregation must be a named aggregation or a callable" + ) try: result = self.agg(function) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." ) from e - + # If the aggregation is a scan, don't broadcast + if libgroupby._is_all_scan_aggregate([[function]]): + if len(result) != len(self.obj): + raise AssertionError( + "Unexpected result length for scan transform" + ) + return result return self._broadcast(result) def rolling(self, *args, **kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ca9d5590044..c412b7a7e47 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1530,11 +1530,6 @@ def median( >>> ser.median() 17.0 - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. - .. pandas-compat:: **DataFrame.median, Series.median** diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 19dde2e51b9..829790007c9 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -103,11 +103,6 @@ def _num_columns(self): def _column(self): return self._data[self.name] - @_column.setter # type: ignore - @_cudf_nvtx_annotate - def _column(self, value): - self._data[self.name] = value - @property # type: ignore @_cudf_nvtx_annotate def values(self): # noqa: D102 diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b2f3fd09146..5ef25a99590 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -38,25 +38,6 @@ def read_json( f"or a bool, or None. Got {type(dtype)}" ) - if engine == "cudf_experimental": - raise ValueError( - "engine='cudf_experimental' support has been removed, " - "use `engine='cudf'`" - ) - - if engine == "cudf_legacy": - # TODO: Deprecated in 23.02, please - # give some time until(more than couple of - # releases from now) `cudf_legacy` - # support can be removed completely. - warnings.warn( - "engine='cudf_legacy' is a deprecated engine." - "This will be removed in a future release." - "Please switch to using engine='cudf'.", - FutureWarning, - ) - if engine == "cudf_legacy" and not lines: - raise ValueError(f"{engine} engine only supports JSON Lines format") if engine == "auto": engine = "cudf" if lines else "pandas" if engine != "cudf" and keep_quotes: @@ -64,7 +45,7 @@ def read_json( "keep_quotes='True' is supported only with engine='cudf'" ) - if engine == "cudf_legacy" or engine == "cudf": + if engine == "cudf": if dtype is None: dtype = True @@ -117,7 +98,7 @@ def read_json( lines, compression, byte_range, - engine == "cudf_legacy", + False, keep_quotes, mixed_types_as_string, ) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index b7c8e92e8db..3c82d571939 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -174,7 +174,7 @@ def _DataFrame__dir__(self): "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, "__iter__": custom_iter, - "dt": _AccessorAttr(DatetimeProperties), + "dt": _AccessorAttr(CombinedDatetimelikeProperties), "str": _AccessorAttr(StringMethods), "cat": _AccessorAttr(_CategoricalAccessor), "_constructor": _FastSlowAttribute("_constructor"), @@ -208,7 +208,7 @@ def Index__new__(cls, *args, **kwargs): "__array_function__": array_function_method, "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, - "dt": _AccessorAttr(DatetimeProperties), + "dt": _AccessorAttr(CombinedDatetimelikeProperties), "str": _AccessorAttr(StringMethods), "cat": _AccessorAttr(_CategoricalAccessor), "__iter__": custom_iter, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 2f6c4ac5b13..07ec5c8bc0c 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -130,10 +130,15 @@ and not test_s3_roundtrip_for_dir[partition_col0] \ and not test_s3_roundtrip_for_dir[partition_col1] \ and not test_s3_roundtrip" +TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ +and not test_large_string_pyarrow \ +and not test_interchange_from_corrected_buffer_dtypes \ +and not test_eof_states" + # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ - -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER" \ + -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \ --import-mode=importlib \ ${PYTEST_IGNORES} \ "$@" || [ $? = 1 ] # Exit success if exit code was 1 (permit test failures but not other errors) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py new file mode 100644 index 00000000000..ae01d953df5 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def string_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +def test_to_upper(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_upper(plc_col) + expected = pa.compute.utf8_upper(string_col) + assert_column_eq(got, expected) + + +def test_to_lower(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_lower(plc_col) + expected = pa.compute.utf8_lower(string_col) + assert_column_eq(got, expected) + + +def test_swapcase(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.swapcase(plc_col) + expected = pa.compute.utf8_swapcase(string_col) + assert_column_eq(got, expected) diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py new file mode 100644 index 00000000000..78d7fbfd879 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_transform.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import itertools + +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"]) +def keys_null(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no-null-values", "null-values"]) +def values_null(request): + return request.param + + +@pytest.fixture +def df(keys_null, values_null): + keys = ["a", "b", "a", "c", "b", "b", "c", "a"] + r = range(len(keys)) + if keys_null: + keys[::3] = itertools.repeat(None, len(r[::3])) + values = list(range(len(keys))) + if values_null: + values[1::3] = itertools.repeat(None, len(r[1::3])) + return cudf.DataFrame({"key": keys, "values": values}) + + +@pytest.mark.parametrize("agg", ["cumsum", "cumprod", "max", "sum", "prod"]) +def test_transform_broadcast(agg, df): + pf = df.to_pandas() + got = df.groupby("key").transform(agg) + expect = pf.groupby("key").transform(agg) + assert_eq(got, expect, check_dtype=False) + + +def test_transform_invalid(): + df = cudf.DataFrame({"key": [1, 1], "values": [4, 5]}) + with pytest.raises(TypeError): + df.groupby("key").transform({"values": "cumprod"}) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 40935733f34..3033a3e75e3 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -495,9 +495,6 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." -) def test_json_engine_selection(): json = "[1, 2, 3]" @@ -519,10 +516,6 @@ def test_json_engine_selection(): for col_name in df.columns: assert isinstance(col_name, int) - # should raise an exception - with pytest.raises(ValueError): - cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy") - def test_json_bool_values(): buffer = "[true,1]\n[false,false]\n[true,true]" @@ -541,30 +534,6 @@ def test_json_bool_values(): np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." -) -@pytest.mark.parametrize( - "buffer", - [ - "[1.0,]\n[null, ]", - '{"0":1.0,"1":}\n{"0":null,"1": }', - '{ "0" : 1.0 , "1" : }\n{ "0" : null , "1" : }', - '{"0":1.0}\n{"1":}', - ], -) -def test_json_null_literal(buffer): - df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy") - - # first column contains a null field, type should be set to float - # second column contains only empty fields, type should be set to int8 - np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) - np.testing.assert_array_equal( - df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] - ) - np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) - - def test_json_bad_protocol_string(): test_string = StringIO('{"field": "s3://path"}') @@ -739,14 +708,8 @@ def test_default_integer_bitwidth(default_integer_bitwidth, engine): @pytest.mark.parametrize( "engine", [ - pytest.param( - "cudf_legacy", - marks=pytest.mark.skip( - reason="cannot partially set dtypes for cudf json engine" - ), - ), - "pandas", "cudf", + "pandas", ], ) def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0a0ee4f592c..8c58f2b859e 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -543,7 +543,7 @@ function or `StringIO`). Multiple inputs may be provided as a list. If a list is specified each list entry may be of a different input type as long as each input is of a valid type and all input JSON schema(s) match. -engine : {{ 'auto', 'cudf', 'cudf_legacy', 'pandas' }}, default 'auto' +engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. See notes below. orient : string diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 003a92988de..434383bc208 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.2dev0", - "protobuf>=4.21,<5", + "protobuf>=3.20,<5", "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", "rich",