From 15c148dcbba087ed1be32e0cef7188c9b609e7dc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:50:26 -0700 Subject: [PATCH 1/6] Fix for logical and syntactical errors in libcudf c++ examples (#15346) This PR fixes a couple of fatal compile and runtime errors in `libcudf/strings` examples Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15346 --- cpp/examples/build.sh | 4 +++- cpp/examples/strings/common.hpp | 4 +++- cpp/examples/strings/custom_optimized.cu | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 001cdeec694..424da35ad18 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,9 +1,11 @@ #!/bin/bash -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
# libcudf examples build script +set -euo pipefail + # Parallelism control PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index 0dbe6fe2b7b..65a9c100c7c 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -110,7 +111,8 @@ int main(int argc, char const** argv) std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; + auto const scv = cudf::strings_column_view(result->view()); + std::cout << "Output size " << scv.chars_size(rmm::cuda_stream_default) << " bytes\n"; return 0; } diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index cefa3346150..62ca19a5ca9 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -153,8 +153,12 @@ std::unique_ptr redact_strings(cudf::column_view const& names, redact_kernel<<>>( *d_names, *d_visibilities, offsets.data(), chars.data()); - // create column from offsets and chars vectors (no copy is performed) - auto result = cudf::make_strings_column(names.size(), std::move(offsets), chars.release(), {}, 0); + // create column from offsets vector (move only) + auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); + + // create column for chars vector (no copy is performed) + auto result = cudf::make_strings_column( + names.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); // wait for all of the above to finish stream.synchronize(); From b06536d3c061d62286c6844ed8d6a69cf906dc3d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 10 Apr 2024 08:56:47 -0500 Subject: [PATCH 2/6] Make improvements in pandas-test reporting (#15485) This PR fixes an issue where 
`listJobsForWorkflowRun` returns only 30 jobs' details by default, so we need to paginate and load all of the remaining job details to be able to filter jobs. This PR also addresses review comments in https://github.com/rapidsai/cudf/pull/15369/ Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15485 --- .github/workflows/status.yaml | 13 +++++++++---- .github/workflows/test.yaml | 2 +- ci/cudf_pandas_scripts/pandas-tests/diff.sh | 9 +++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/workflows/status.yaml b/.github/workflows/status.yaml index 0aad4c8a23e..781264bc55e 100644 --- a/.github/workflows/status.yaml +++ b/.github/workflows/status.yaml @@ -85,13 +85,18 @@ jobs: state: CUSTOM_STATE = 'success' } = contentJSON; - // Fetch the first job ID from the workflow run - const jobs = await github.rest.actions.listJobsForWorkflowRun({ + // Fetch all jobs using pagination + const jobs = await github.paginate( + github.rest.actions.listJobsForWorkflowRun, + { owner: context.repo.owner, repo: context.repo.repo, run_id: process.env.WORKFLOW_RUN_ID, - }); - const job = jobs.data.jobs.find(job => job.name === JOB_NAME); + } + ); + + // Fetch the first job ID from the workflow run + const job = jobs.find(job => job.name === JOB_NAME); const JOB_ID = job ? 
job.id : null; // Set default target URL if not defined diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ea47b6ad466..65aef37697e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -130,7 +130,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index ae5a249bcbd..cf80f383db4 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -10,12 +10,13 @@ GH_JOB_NAME="pandas-tests-diff / build" rapids-logger "Github job name: ${GH_JOB_NAME}" -MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py39.pr-results.json +PY_VER="39" +MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json +PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json rapids-logger "Fetching latest available results from nightly" -aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt -cat s3_output.txt +aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt + read -r COMPARE_ENV < s3_output.txt export 
COMPARE_ENV rapids-logger "Latest available results from nightly: ${COMPARE_ENV}" From 94726ad056e2473c836f47d310e2584bdf44d1f9 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 10 Apr 2024 10:12:23 -0400 Subject: [PATCH 3/6] Update Changelog [skip ci] --- CHANGELOG.md | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bce764f59e3..7ecad2c9c39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,300 @@ +# cuDF 24.04.00 (10 Apr 2024) + +## 🚨 Breaking Changes + +- Restructure pylibcudf/arrow interop facilities ([#15325](https://github.com/rapidsai/cudf/pull/15325)) [@vyasr](https://github.com/vyasr) +- Change exceptions thrown by copying APIs ([#15319](https://github.com/rapidsai/cudf/pull/15319)) [@vyasr](https://github.com/vyasr) +- Change strings_column_view::char_size to return int64 ([#15197](https://github.com/rapidsai/cudf/pull/15197)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to `arrow-14.0.2` ([#15108](https://github.com/rapidsai/cudf/pull/15108)) [@galipremsagar](https://github.com/galipremsagar) +- Add support for `pandas-2.2` in `cudf` ([#15100](https://github.com/rapidsai/cudf/pull/15100)) [@galipremsagar](https://github.com/galipremsagar) +- Deprecate cudf::hashing::spark_murmurhash3_x86_32 ([#15074](https://github.com/rapidsai/cudf/pull/15074)) [@davidwendt](https://github.com/davidwendt) +- Align MultiIndex.get_indexder with pandas 2.2 change ([#15059](https://github.com/rapidsai/cudf/pull/15059)) [@mroeschke](https://github.com/mroeschke) +- Raise an error on import for unsupported GPUs. 
([#15053](https://github.com/rapidsai/cudf/pull/15053)) [@bdice](https://github.com/bdice) +- Deprecate datelike isin casting strings to dates to match pandas 2.2 ([#15046](https://github.com/rapidsai/cudf/pull/15046)) [@mroeschke](https://github.com/mroeschke) +- Align concat Series name behavior in pandas 2.2 ([#15032](https://github.com/rapidsai/cudf/pull/15032)) [@mroeschke](https://github.com/mroeschke) +- Add `future_stack` to `DataFrame.stack` ([#15015](https://github.com/rapidsai/cudf/pull/15015)) [@galipremsagar](https://github.com/galipremsagar) +- Deprecate groupby fillna ([#15000](https://github.com/rapidsai/cudf/pull/15000)) [@mroeschke](https://github.com/mroeschke) +- Deprecate replace with categorical columns ([#14988](https://github.com/rapidsai/cudf/pull/14988)) [@mroeschke](https://github.com/mroeschke) +- Deprecate delim_whitespace in read_csv for pandas 2.2 ([#14986](https://github.com/rapidsai/cudf/pull/14986)) [@mroeschke](https://github.com/mroeschke) +- Deprecate parameters similar to pandas 2.2 ([#14984](https://github.com/rapidsai/cudf/pull/14984)) [@mroeschke](https://github.com/mroeschke) +- Add missing atomic operators, refactor atomic operators, move atomic operators to detail namespace. 
([#14962](https://github.com/rapidsai/cudf/pull/14962)) [@bdice](https://github.com/bdice) +- Add `pandas-2.x` support in `cudf` ([#14916](https://github.com/rapidsai/cudf/pull/14916)) [@galipremsagar](https://github.com/galipremsagar) +- Use cuco::static_set in the hash-based groupby ([#14813](https://github.com/rapidsai/cudf/pull/14813)) [@PointKernel](https://github.com/PointKernel) + +## 🐛 Bug Fixes + +- Fix an issue with creating a series from scalar when `dtype='category'` ([#15476](https://github.com/rapidsai/cudf/pull/15476)) [@galipremsagar](https://github.com/galipremsagar) +- Update pre-commit-hooks to v0.0.3 ([#15355](https://github.com/rapidsai/cudf/pull/15355)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- [BUG][JNI] Trigger MemoryBuffer.onClosed after memory is freed ([#15351](https://github.com/rapidsai/cudf/pull/15351)) [@abellina](https://github.com/abellina) +- Fix an issue with multiple short list rowgroups using the Parquet chunked reader. ([#15342](https://github.com/rapidsai/cudf/pull/15342)) [@nvdbaranec](https://github.com/nvdbaranec) +- Avoid importing dask-expr if "query-planning" config is `False` ([#15340](https://github.com/rapidsai/cudf/pull/15340)) [@rjzamora](https://github.com/rjzamora) +- Fix gtests/ERROR_TEST errors when run in Debug ([#15317](https://github.com/rapidsai/cudf/pull/15317)) [@davidwendt](https://github.com/davidwendt) +- Fix OOB read in `inflate_kernel` ([#15309](https://github.com/rapidsai/cudf/pull/15309)) [@vuule](https://github.com/vuule) +- Work around a cuFile error when running CSV tests with memcheck ([#15293](https://github.com/rapidsai/cudf/pull/15293)) [@vuule](https://github.com/vuule) +- Fix Doxygen upload directory ([#15291](https://github.com/rapidsai/cudf/pull/15291)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix Doxygen check ([#15289](https://github.com/rapidsai/cudf/pull/15289)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Reintroduce PANDAS_GE_220 import 
([#15287](https://github.com/rapidsai/cudf/pull/15287)) [@wence-](https://github.com/wence-) +- Fix mean computation for the geometric distribution in the data generator ([#15282](https://github.com/rapidsai/cudf/pull/15282)) [@vuule](https://github.com/vuule) +- Fix Parquet decimal64 stats ([#15281](https://github.com/rapidsai/cudf/pull/15281)) [@etseidl](https://github.com/etseidl) +- Make linking of nvtx3-cpp BUILD_LOCAL_INTERFACE ([#15271](https://github.com/rapidsai/cudf/pull/15271)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Workaround compute-sanitizer memcheck bug ([#15259](https://github.com/rapidsai/cudf/pull/15259)) [@davidwendt](https://github.com/davidwendt) +- Cleanup `hostdevice_vector` and add more APIs ([#15252](https://github.com/rapidsai/cudf/pull/15252)) [@ttnghia](https://github.com/ttnghia) +- Fix number of rows in randomly generated lists columns ([#15248](https://github.com/rapidsai/cudf/pull/15248)) [@vuule](https://github.com/vuule) +- Fix wrong output for `collect_list`/`collect_set` of lists column ([#15243](https://github.com/rapidsai/cudf/pull/15243)) [@ttnghia](https://github.com/ttnghia) +- Fix testchunkedPackTwoPasses to copy from the bounce buffer ([#15220](https://github.com/rapidsai/cudf/pull/15220)) [@abellina](https://github.com/abellina) +- Fix accessing `.columns` by an external API ([#15212](https://github.com/rapidsai/cudf/pull/15212)) [@galipremsagar](https://github.com/galipremsagar) +- [JNI] Disable testChunkedPackTwoPasses for now ([#15210](https://github.com/rapidsai/cudf/pull/15210)) [@abellina](https://github.com/abellina) +- Update labeler and codeowner configs for CMake files ([#15208](https://github.com/rapidsai/cudf/pull/15208)) [@PointKernel](https://github.com/PointKernel) +- Avoid dict normalization in ``__dask_tokenize__`` ([#15187](https://github.com/rapidsai/cudf/pull/15187)) [@rjzamora](https://github.com/rjzamora) +- Fix memcheck error in distinct inner join 
([#15164](https://github.com/rapidsai/cudf/pull/15164)) [@PointKernel](https://github.com/PointKernel) +- Remove unneeded script parameters in test_cpp_memcheck.sh ([#15158](https://github.com/rapidsai/cudf/pull/15158)) [@davidwendt](https://github.com/davidwendt) +- Fix `ListColumn.to_pandas()` to retain `list` type ([#15155](https://github.com/rapidsai/cudf/pull/15155)) [@galipremsagar](https://github.com/galipremsagar) +- Avoid factorization in MultiIndex.to_pandas ([#15150](https://github.com/rapidsai/cudf/pull/15150)) [@mroeschke](https://github.com/mroeschke) +- Fix GroupBy.get_group and GroupBy.indices ([#15143](https://github.com/rapidsai/cudf/pull/15143)) [@wence-](https://github.com/wence-) +- Remove `const` from `range_window_bounds::_extent`. ([#15138](https://github.com/rapidsai/cudf/pull/15138)) [@mythrocks](https://github.com/mythrocks) +- DataFrame.columns = ... retains RangeIndex & set dtype ([#15129](https://github.com/rapidsai/cudf/pull/15129)) [@mroeschke](https://github.com/mroeschke) +- Correctly handle output for `GroupBy.apply` when chunk results are reindexed series ([#15109](https://github.com/rapidsai/cudf/pull/15109)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix Series.groupby.shift with a MultiIndex ([#15098](https://github.com/rapidsai/cudf/pull/15098)) [@mroeschke](https://github.com/mroeschke) +- Fix reductions when DataFrame has MultiIndex columns ([#15097](https://github.com/rapidsai/cudf/pull/15097)) [@mroeschke](https://github.com/mroeschke) +- Fix deprecation warnings for deprecated hash() calls ([#15095](https://github.com/rapidsai/cudf/pull/15095)) [@davidwendt](https://github.com/davidwendt) +- Add support for arrow `large_string` in `cudf` ([#15093](https://github.com/rapidsai/cudf/pull/15093)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `sort_values` pytest failure with pandas-2.x regression ([#15092](https://github.com/rapidsai/cudf/pull/15092)) 
[@galipremsagar](https://github.com/galipremsagar) +- Resolve path parsing issues in `get_json_object` ([#15082](https://github.com/rapidsai/cudf/pull/15082)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Fix bugs in handling of delta encodings ([#15075](https://github.com/rapidsai/cudf/pull/15075)) [@etseidl](https://github.com/etseidl) +- Fix `is_device_write_preferred` in `void_sink` and `user_sink_wrapper` ([#15064](https://github.com/rapidsai/cudf/pull/15064)) [@vuule](https://github.com/vuule) +- Eliminate duplicate allocation of nested string columns ([#15061](https://github.com/rapidsai/cudf/pull/15061)) [@vuule](https://github.com/vuule) +- Raise an error on import for unsupported GPUs. ([#15053](https://github.com/rapidsai/cudf/pull/15053)) [@bdice](https://github.com/bdice) +- Align concat Series name behavior in pandas 2.2 ([#15032](https://github.com/rapidsai/cudf/pull/15032)) [@mroeschke](https://github.com/mroeschke) +- Fix `Index.difference` to handle duplicate values when one of the inputs is empty ([#15016](https://github.com/rapidsai/cudf/pull/15016)) [@galipremsagar](https://github.com/galipremsagar) +- Add `future_stack` to `DataFrame.stack` ([#15015](https://github.com/rapidsai/cudf/pull/15015)) [@galipremsagar](https://github.com/galipremsagar) +- Fix handling of values=None in pylibcudf GroupBy.get_groups ([#14998](https://github.com/rapidsai/cudf/pull/14998)) [@shwina](https://github.com/shwina) +- Fix `DataFrame.sort_index` to respect `ignore_index` on all axis ([#14995](https://github.com/rapidsai/cudf/pull/14995)) [@galipremsagar](https://github.com/galipremsagar) +- Raise for pyarrow array that is tz-aware ([#14980](https://github.com/rapidsai/cudf/pull/14980)) [@mroeschke](https://github.com/mroeschke) +- Direct ``SeriesGroupBy.aggregate`` to ``SeriesGroupBy.agg`` ([#14971](https://github.com/rapidsai/cudf/pull/14971)) [@rjzamora](https://github.com/rjzamora) +- Respect IntervalDtype and CategoricalDtype objects passed by 
users ([#14961](https://github.com/rapidsai/cudf/pull/14961)) [@mroeschke](https://github.com/mroeschke) +- unset `CUDF_SPILL` after a pytest ([#14958](https://github.com/rapidsai/cudf/pull/14958)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Null literals to be not parsed as string when mixed types as string is enabled in JSON reader ([#14939](https://github.com/rapidsai/cudf/pull/14939)) [@karthikeyann](https://github.com/karthikeyann) +- Fix chunked reads of Parquet delta encoded pages ([#14921](https://github.com/rapidsai/cudf/pull/14921)) [@etseidl](https://github.com/etseidl) +- Fix reading offset for data stream in ORC reader ([#14911](https://github.com/rapidsai/cudf/pull/14911)) [@ttnghia](https://github.com/ttnghia) +- Enable sanitizer check for a test case testORCReadAndWriteForDecimal128 ([#14897](https://github.com/rapidsai/cudf/pull/14897)) [@res-life](https://github.com/res-life) +- Fix dask token normalization ([#14829](https://github.com/rapidsai/cudf/pull/14829)) [@rjzamora](https://github.com/rjzamora) +- Fix 24.04 versions ([#14825](https://github.com/rapidsai/cudf/pull/14825)) [@raydouglass](https://github.com/raydouglass) +- Ensure slow private attrs are maybe proxies ([#14380](https://github.com/rapidsai/cudf/pull/14380)) [@mroeschke](https://github.com/mroeschke) + +## 📖 Documentation + +- Ignore DLManagedTensor in the docs build ([#15392](https://github.com/rapidsai/cudf/pull/15392)) [@davidwendt](https://github.com/davidwendt) +- Revert "Temporarily disable docs errors. (#15265)" ([#15269](https://github.com/rapidsai/cudf/pull/15269)) [@bdice](https://github.com/bdice) +- Temporarily disable docs errors. 
([#15265](https://github.com/rapidsai/cudf/pull/15265)) [@bdice](https://github.com/bdice) +- Update `developer_guide.md` with new guidance on quoted internal includes ([#15238](https://github.com/rapidsai/cudf/pull/15238)) [@harrism](https://github.com/harrism) +- Fix broken link for developer guide ([#15025](https://github.com/rapidsai/cudf/pull/15025)) [@sanjana098](https://github.com/sanjana098) +- [DOC] Update typo in docs example of structs_column_wrapper ([#14949](https://github.com/rapidsai/cudf/pull/14949)) [@karthikeyann](https://github.com/karthikeyann) +- Update cudf.pandas FAQ. ([#14940](https://github.com/rapidsai/cudf/pull/14940)) [@bdice](https://github.com/bdice) +- Optimize doc builds ([#14856](https://github.com/rapidsai/cudf/pull/14856)) [@vyasr](https://github.com/vyasr) +- Add developer guideline to use east const. ([#14836](https://github.com/rapidsai/cudf/pull/14836)) [@bdice](https://github.com/bdice) +- Document how cuDF is pronounced ([#14753](https://github.com/rapidsai/cudf/pull/14753)) [@pentschev](https://github.com/pentschev) +- Notes convert to Pandas-compat ([#12641](https://github.com/rapidsai/cudf/pull/12641)) [@Touutae-lab](https://github.com/Touutae-lab) + +## 🚀 New Features + +- Address inconsistency in single quote normalization in JSON reader ([#15324](https://github.com/rapidsai/cudf/pull/15324)) [@shrshi](https://github.com/shrshi) +- Use JNI pinned pool resource with cuIO ([#15255](https://github.com/rapidsai/cudf/pull/15255)) [@abellina](https://github.com/abellina) +- Add DELTA_BYTE_ARRAY encoder for Parquet ([#15239](https://github.com/rapidsai/cudf/pull/15239)) [@etseidl](https://github.com/etseidl) +- Migrate filling operations to pylibcudf ([#15225](https://github.com/rapidsai/cudf/pull/15225)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- [JNI] rmm based pinned pool ([#15219](https://github.com/rapidsai/cudf/pull/15219)) [@abellina](https://github.com/abellina) +- Implement zero-copy host buffer 
source instead of using an arrow implementation ([#15189](https://github.com/rapidsai/cudf/pull/15189)) [@vuule](https://github.com/vuule) +- Enable creation of columns from scalar ([#15181](https://github.com/rapidsai/cudf/pull/15181)) [@vyasr](https://github.com/vyasr) +- Use NVTX from GitHub. ([#15178](https://github.com/rapidsai/cudf/pull/15178)) [@bdice](https://github.com/bdice) +- Implement `segmented_row_bit_count` for computing row sizes by segments of rows ([#15169](https://github.com/rapidsai/cudf/pull/15169)) [@ttnghia](https://github.com/ttnghia) +- Implement search using pylibcudf ([#15166](https://github.com/rapidsai/cudf/pull/15166)) [@vyasr](https://github.com/vyasr) +- Add distinct left join ([#15149](https://github.com/rapidsai/cudf/pull/15149)) [@PointKernel](https://github.com/PointKernel) +- Add cardinality control for groupby benchs with flat types ([#15134](https://github.com/rapidsai/cudf/pull/15134)) [@PointKernel](https://github.com/PointKernel) +- Add ability to request Parquet encodings on a per-column basis ([#15081](https://github.com/rapidsai/cudf/pull/15081)) [@etseidl](https://github.com/etseidl) +- Automate include grouping order in .clang-format ([#15063](https://github.com/rapidsai/cudf/pull/15063)) [@harrism](https://github.com/harrism) +- Requesting a clean build directory also clears Jitify cache ([#15052](https://github.com/rapidsai/cudf/pull/15052)) [@robertmaynard](https://github.com/robertmaynard) +- API for JSON unquoted whitespace normalization ([#15033](https://github.com/rapidsai/cudf/pull/15033)) [@shrshi](https://github.com/shrshi) +- Implement concatenate, lists.explode, merge, sorting, and stream compaction in pylibcudf ([#15011](https://github.com/rapidsai/cudf/pull/15011)) [@vyasr](https://github.com/vyasr) +- Implement replace in pylibcudf ([#15005](https://github.com/rapidsai/cudf/pull/15005)) [@vyasr](https://github.com/vyasr) +- Add distinct key inner join 
([#14990](https://github.com/rapidsai/cudf/pull/14990)) [@PointKernel](https://github.com/PointKernel) +- Implement rolling in pylibcudf ([#14982](https://github.com/rapidsai/cudf/pull/14982)) [@vyasr](https://github.com/vyasr) +- Implement joins in pylibcudf ([#14972](https://github.com/rapidsai/cudf/pull/14972)) [@vyasr](https://github.com/vyasr) +- Implement scans and reductions in pylibcudf ([#14970](https://github.com/rapidsai/cudf/pull/14970)) [@vyasr](https://github.com/vyasr) +- Rewrite cudf internals using pylibcudf groupby ([#14946](https://github.com/rapidsai/cudf/pull/14946)) [@vyasr](https://github.com/vyasr) +- Implement groupby in pylibcudf ([#14945](https://github.com/rapidsai/cudf/pull/14945)) [@vyasr](https://github.com/vyasr) +- Support casting of Map type to string in JSON reader ([#14936](https://github.com/rapidsai/cudf/pull/14936)) [@karthikeyann](https://github.com/karthikeyann) +- POC for whitespace removal in input JSON data using FST ([#14931](https://github.com/rapidsai/cudf/pull/14931)) [@shrshi](https://github.com/shrshi) +- Support for LZ4 compression in ORC and Parquet ([#14906](https://github.com/rapidsai/cudf/pull/14906)) [@vuule](https://github.com/vuule) +- Remove supports_streams from cuDF custom memory resources. 
([#14857](https://github.com/rapidsai/cudf/pull/14857)) [@harrism](https://github.com/harrism) +- Migrate unary operations to pylibcudf ([#14850](https://github.com/rapidsai/cudf/pull/14850)) [@vyasr](https://github.com/vyasr) +- Migrate binary operations to pylibcudf ([#14821](https://github.com/rapidsai/cudf/pull/14821)) [@vyasr](https://github.com/vyasr) +- Add row index and stripe size options to Python ORC chunked writer ([#14785](https://github.com/rapidsai/cudf/pull/14785)) [@vuule](https://github.com/vuule) +- Support CUDA 12.2 ([#14712](https://github.com/rapidsai/cudf/pull/14712)) [@jameslamb](https://github.com/jameslamb) + +## 🛠️ Improvements + +- Use `conda env create --yes` instead of `--force` ([#15403](https://github.com/rapidsai/cudf/pull/15403)) [@bdice](https://github.com/bdice) +- Restructure pylibcudf/arrow interop facilities ([#15325](https://github.com/rapidsai/cudf/pull/15325)) [@vyasr](https://github.com/vyasr) +- Change exceptions thrown by copying APIs ([#15319](https://github.com/rapidsai/cudf/pull/15319)) [@vyasr](https://github.com/vyasr) +- Enable branch testing for `cudf.pandas` ([#15316](https://github.com/rapidsai/cudf/pull/15316)) [@galipremsagar](https://github.com/galipremsagar) +- Replace black with ruff-format ([#15312](https://github.com/rapidsai/cudf/pull/15312)) [@mroeschke](https://github.com/mroeschke) +- This fixes an NPE when trying to read empty JSON data by adding a new API for missing information ([#15307](https://github.com/rapidsai/cudf/pull/15307)) [@revans2](https://github.com/revans2) +- Address poor performance of Parquet string decoding ([#15304](https://github.com/rapidsai/cudf/pull/15304)) [@etseidl](https://github.com/etseidl) +- Update script input name ([#15301](https://github.com/rapidsai/cudf/pull/15301)) [@AyodeAwe](https://github.com/AyodeAwe) +- Make test_read_parquet_partitioned_filtered data deterministic ([#15296](https://github.com/rapidsai/cudf/pull/15296)) 
[@mroeschke](https://github.com/mroeschke) +- Add timeout for `cudf.pandas` pandas tests ([#15284](https://github.com/rapidsai/cudf/pull/15284)) [@galipremsagar](https://github.com/galipremsagar) +- Add upper bound to prevent usage of NumPy 2 ([#15283](https://github.com/rapidsai/cudf/pull/15283)) [@bdice](https://github.com/bdice) +- Fix cudf::test::to_host return of host_vector ([#15263](https://github.com/rapidsai/cudf/pull/15263)) [@davidwendt](https://github.com/davidwendt) +- Implement grouped product scan ([#15254](https://github.com/rapidsai/cudf/pull/15254)) [@wence-](https://github.com/wence-) +- Add CUDA 12.4 to supported PTX versions ([#15247](https://github.com/rapidsai/cudf/pull/15247)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Implement DataFrame|Series.squeeze ([#15244](https://github.com/rapidsai/cudf/pull/15244)) [@mroeschke](https://github.com/mroeschke) +- Roll back ipow changes due to register pressure. ([#15242](https://github.com/rapidsai/cudf/pull/15242)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Remove create_chars_child_column utility ([#15241](https://github.com/rapidsai/cudf/pull/15241)) [@davidwendt](https://github.com/davidwendt) +- Update dlpack to version 0.8 ([#15237](https://github.com/rapidsai/cudf/pull/15237)) [@dantegd](https://github.com/dantegd) +- Improve performance in JSON reader when `mixed_types_as_string` option is enabled ([#15236](https://github.com/rapidsai/cudf/pull/15236)) [@shrshi](https://github.com/shrshi) +- Remove row conversion code from libcudf ([#15234](https://github.com/rapidsai/cudf/pull/15234)) [@ttnghia](https://github.com/ttnghia) +- Use variable substitution for RAPIDS version in Doxyfile ([#15231](https://github.com/rapidsai/cudf/pull/15231)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add ListColumns.to_pandas(arrow_type=) ([#15228](https://github.com/rapidsai/cudf/pull/15228)) [@mroeschke](https://github.com/mroeschke) +- Treat dask-cudf CI artifacts 
as pure wheels ([#15223](https://github.com/rapidsai/cudf/pull/15223)) [@bdice](https://github.com/bdice) +- Clean up usage of __CUDA_ARCH__ and other macros. ([#15218](https://github.com/rapidsai/cudf/pull/15218)) [@bdice](https://github.com/bdice) +- DOC: use constants in performance-comparisons.ipynb ([#15215](https://github.com/rapidsai/cudf/pull/15215)) [@raybellwaves](https://github.com/raybellwaves) +- Rewrite conversion in terms of column ([#15213](https://github.com/rapidsai/cudf/pull/15213)) [@vyasr](https://github.com/vyasr) +- Switch `pytest-xdist` algo to `worksteal` ([#15207](https://github.com/rapidsai/cudf/pull/15207)) [@galipremsagar](https://github.com/galipremsagar) +- Deprecate strings_column_view::offsets_begin() ([#15205](https://github.com/rapidsai/cudf/pull/15205)) [@davidwendt](https://github.com/davidwendt) +- Add `get_upstream_resource` method to `stream_checking_resource_adaptor` ([#15203](https://github.com/rapidsai/cudf/pull/15203)) [@miscco](https://github.com/miscco) +- Tune up row size estimation in the data generator ([#15202](https://github.com/rapidsai/cudf/pull/15202)) [@vuule](https://github.com/vuule) +- Fix `offset` value for generating test data in `parquet_chunked_reader_test.cu` ([#15200](https://github.com/rapidsai/cudf/pull/15200)) [@ttnghia](https://github.com/ttnghia) +- Change strings_column_view::char_size to return int64 ([#15197](https://github.com/rapidsai/cudf/pull/15197)) [@davidwendt](https://github.com/davidwendt) +- Fix includes for row_operators.cuh ([#15194](https://github.com/rapidsai/cudf/pull/15194)) [@davidwendt](https://github.com/davidwendt) +- Generalize GHA selectors for pure Python testing ([#15191](https://github.com/rapidsai/cudf/pull/15191)) [@bdice](https://github.com/bdice) +- Improvements for `__cuda_array_interface__` tests ([#15188](https://github.com/rapidsai/cudf/pull/15188)) [@bdice](https://github.com/bdice) +- Allow to_pandas to return pandas.ArrowDtype 
([#15182](https://github.com/rapidsai/cudf/pull/15182)) [@mroeschke](https://github.com/mroeschke) +- Ignore `byte_range` in `read_json` when the size is not smaller than the input data ([#15180](https://github.com/rapidsai/cudf/pull/15180)) [@vuule](https://github.com/vuule) +- Expose new stable_sort and finish stream_compaction in pylibcudf ([#15175](https://github.com/rapidsai/cudf/pull/15175)) [@wence-](https://github.com/wence-) +- [ci] update matrix filters for dask-cudf builds ([#15174](https://github.com/rapidsai/cudf/pull/15174)) [@jameslamb](https://github.com/jameslamb) +- Change make_strings_children to return uvector ([#15171](https://github.com/rapidsai/cudf/pull/15171)) [@davidwendt](https://github.com/davidwendt) +- Don't override to_pandas for Datelike columns ([#15167](https://github.com/rapidsai/cudf/pull/15167)) [@mroeschke](https://github.com/mroeschke) +- Drop python-snappy from dependencies. ([#15161](https://github.com/rapidsai/cudf/pull/15161)) [@bdice](https://github.com/bdice) +- Add microkernels for fixed-width and fixed-width dictionary in Parquet decode ([#15159](https://github.com/rapidsai/cudf/pull/15159)) [@abellina](https://github.com/abellina) +- Make HostColumnVector.DataType accessor methods public ([#15157](https://github.com/rapidsai/cudf/pull/15157)) [@jbrennan333](https://github.com/jbrennan333) +- Java bindings for left outer distinct join ([#15154](https://github.com/rapidsai/cudf/pull/15154)) [@jlowe](https://github.com/jlowe) +- Forward-merge branch-24.02 to branch-24.04 ([#15153](https://github.com/rapidsai/cudf/pull/15153)) [@bdice](https://github.com/bdice) +- Enable pandas pytests for `cudf.pandas` ([#15147](https://github.com/rapidsai/cudf/pull/15147)) [@galipremsagar](https://github.com/galipremsagar) +- Add java option to keep quotes for JSON reads ([#15146](https://github.com/rapidsai/cudf/pull/15146)) [@revans2](https://github.com/revans2) +- Change cross-pandas-version testing in `cudf` 
([#15145](https://github.com/rapidsai/cudf/pull/15145)) [@galipremsagar](https://github.com/galipremsagar) +- Use `hostdevice_vector` in `kernel_error` to avoid the pageable copy ([#15140](https://github.com/rapidsai/cudf/pull/15140)) [@vuule](https://github.com/vuule) +- Clean up Columns.astype & cudf.dtype ([#15125](https://github.com/rapidsai/cudf/pull/15125)) [@mroeschke](https://github.com/mroeschke) +- Simplify some to_pandas implementations ([#15123](https://github.com/rapidsai/cudf/pull/15123)) [@mroeschke](https://github.com/mroeschke) +- Java: Add leak tracking for Scalar instances ([#15121](https://github.com/rapidsai/cudf/pull/15121)) [@jlowe](https://github.com/jlowe) +- Remove calls to strings_column_view::offsets_begin() ([#15112](https://github.com/rapidsai/cudf/pull/15112)) [@davidwendt](https://github.com/davidwendt) +- Add support for Python 3.11, require NumPy 1.23+ ([#15111](https://github.com/rapidsai/cudf/pull/15111)) [@jameslamb](https://github.com/jameslamb) +- Compile-time ipow computation with array lookup ([#15110](https://github.com/rapidsai/cudf/pull/15110)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Upgrade to `arrow-14.0.2` ([#15108](https://github.com/rapidsai/cudf/pull/15108)) [@galipremsagar](https://github.com/galipremsagar) +- Dynamically set version in RAPIDS doc builds ([#15101](https://github.com/rapidsai/cudf/pull/15101)) [@jakirkham](https://github.com/jakirkham) +- Add support for `pandas-2.2` in `cudf` ([#15100](https://github.com/rapidsai/cudf/pull/15100)) [@galipremsagar](https://github.com/galipremsagar) +- Update devcontainers to CUDA Toolkit 12.2 ([#15099](https://github.com/rapidsai/cudf/pull/15099)) [@trxcllnt](https://github.com/trxcllnt) +- Fix `datetime` binop pytest failures in pandas-2.2 ([#15090](https://github.com/rapidsai/cudf/pull/15090)) [@galipremsagar](https://github.com/galipremsagar) +- Validate types in pylibcudf Column/Table constructors 
([#15088](https://github.com/rapidsai/cudf/pull/15088)) [@wence-](https://github.com/wence-) +- xfail test_join_ordering_pandas_compat for pandas 2.2 ([#15080](https://github.com/rapidsai/cudf/pull/15080)) [@mroeschke](https://github.com/mroeschke) +- Add general purpose host memory allocator reference to cuIO with a demo of pooled-pinned allocation. ([#15079](https://github.com/rapidsai/cudf/pull/15079)) [@nvdbaranec](https://github.com/nvdbaranec) +- Adjust test_binops for pandas 2.2 ([#15078](https://github.com/rapidsai/cudf/pull/15078)) [@mroeschke](https://github.com/mroeschke) +- Remove offsets_begin() call from nvtext::generate_ngrams ([#15077](https://github.com/rapidsai/cudf/pull/15077)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::detail::has_nonempty_null_rows ([#15076](https://github.com/rapidsai/cudf/pull/15076)) [@davidwendt](https://github.com/davidwendt) +- Deprecate cudf::hashing::spark_murmurhash3_x86_32 ([#15074](https://github.com/rapidsai/cudf/pull/15074)) [@davidwendt](https://github.com/davidwendt) +- Fix cudf::test::to_host to handle both offset types for strings columns ([#15073](https://github.com/rapidsai/cudf/pull/15073)) [@davidwendt](https://github.com/davidwendt) +- Add condition for test_groupby_nulls_basic in pandas 2.2 ([#15072](https://github.com/rapidsai/cudf/pull/15072)) [@mroeschke](https://github.com/mroeschke) +- xfail tests in test_udf_masked_ops due to pandas 2.2 bug ([#15071](https://github.com/rapidsai/cudf/pull/15071)) [@mroeschke](https://github.com/mroeschke) +- target branch-24.04 for GitHub Actions workflows ([#15069](https://github.com/rapidsai/cudf/pull/15069)) [@jameslamb](https://github.com/jameslamb) +- Implement stable version of `cudf::sort` ([#15066](https://github.com/rapidsai/cudf/pull/15066)) [@wence-](https://github.com/wence-) +- Fix ORC and JSON tests failures for pandas 2.2 ([#15062](https://github.com/rapidsai/cudf/pull/15062)) [@mroeschke](https://github.com/mroeschke) +- 
Adjust test_joining for pandas 2.2 ([#15060](https://github.com/rapidsai/cudf/pull/15060)) [@mroeschke](https://github.com/mroeschke) +- Align MultiIndex.get_indexer with pandas 2.2 change ([#15059](https://github.com/rapidsai/cudf/pull/15059)) [@mroeschke](https://github.com/mroeschke) +- Fix test_resample index dtype checking for pandas 2.2 ([#15058](https://github.com/rapidsai/cudf/pull/15058)) [@mroeschke](https://github.com/mroeschke) +- Split out strings/replace.cu and rework its gtests ([#15054](https://github.com/rapidsai/cudf/pull/15054)) [@davidwendt](https://github.com/davidwendt) +- Avoid incompatible value type setting in test_rolling for pandas 2.2 ([#15050](https://github.com/rapidsai/cudf/pull/15050)) [@mroeschke](https://github.com/mroeschke) +- Change chained replace inplace test to COW test for pandas 2.2 ([#15049](https://github.com/rapidsai/cudf/pull/15049)) [@mroeschke](https://github.com/mroeschke) +- Deprecate datelike isin casting strings to dates to match pandas 2.2 ([#15046](https://github.com/rapidsai/cudf/pull/15046)) [@mroeschke](https://github.com/mroeschke) +- Avoid chained indexing in test_indexing for pandas 2.2 ([#15045](https://github.com/rapidsai/cudf/pull/15045)) [@mroeschke](https://github.com/mroeschke) +- Avoid pandas 2.2 `DeprecationWarning` in test_hdf ([#15044](https://github.com/rapidsai/cudf/pull/15044)) [@mroeschke](https://github.com/mroeschke) +- Use appropriate make_offsets_child_column for building lists columns ([#15043](https://github.com/rapidsai/cudf/pull/15043)) [@davidwendt](https://github.com/davidwendt) +- Factor out position-offsets logic from strings split_helper utility ([#15040](https://github.com/rapidsai/cudf/pull/15040)) [@davidwendt](https://github.com/davidwendt) +- Forward-merge branch-24.02 to branch-24.04 ([#15039](https://github.com/rapidsai/cudf/pull/15039)) [@bdice](https://github.com/bdice) +- Clean up nvtx macros ([#15038](https://github.com/rapidsai/cudf/pull/15038)) 
[@PointKernel](https://github.com/PointKernel) +- Add xfailures for test_applymap for pandas 2.2 ([#15034](https://github.com/rapidsai/cudf/pull/15034)) [@mroeschke](https://github.com/mroeschke) +- Expose libcudf filter expression in read_parquet ([#15028](https://github.com/rapidsai/cudf/pull/15028)) [@wence-](https://github.com/wence-) +- Adjust tests in test_dataframe.py for pandas 2.2 ([#15023](https://github.com/rapidsai/cudf/pull/15023)) [@mroeschke](https://github.com/mroeschke) +- Adjust test_datetime_infer_format for pandas 2.2 ([#15021](https://github.com/rapidsai/cudf/pull/15021)) [@mroeschke](https://github.com/mroeschke) +- Performance optimizations for parquet sub-rowgroup reader. ([#15020](https://github.com/rapidsai/cudf/pull/15020)) [@nvdbaranec](https://github.com/nvdbaranec) +- JNI bindings for distinct_hash_join ([#15019](https://github.com/rapidsai/cudf/pull/15019)) [@jlowe](https://github.com/jlowe) +- Change copy_if_safe to call thrust instead of the overload function ([#15018](https://github.com/rapidsai/cudf/pull/15018)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of copy_if_else for long strings ([#15017](https://github.com/rapidsai/cudf/pull/15017)) [@davidwendt](https://github.com/davidwendt) +- Fix is_string_dtype test for pandas 2.2 ([#15012](https://github.com/rapidsai/cudf/pull/15012)) [@mroeschke](https://github.com/mroeschke) +- Rework cudf::strings::detail::copy_range for offsetalator ([#15010](https://github.com/rapidsai/cudf/pull/15010)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::get_json_object() ([#15009](https://github.com/rapidsai/cudf/pull/15009)) [@davidwendt](https://github.com/davidwendt) +- Align integral types in ORC to specs ([#15008](https://github.com/rapidsai/cudf/pull/15008)) [@vuule](https://github.com/vuule) +- Clean up detail sequence header inclusion ([#15007](https://github.com/rapidsai/cudf/pull/15007)) [@PointKernel](https://github.com/PointKernel) +- 
Add groupby.apply(include_groups=) to match pandas 2.2 deprecation ([#15006](https://github.com/rapidsai/cudf/pull/15006)) [@mroeschke](https://github.com/mroeschke) +- Use offsetalator in cudf::interleave_columns() ([#15004](https://github.com/rapidsai/cudf/pull/15004)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::row_bit_count() ([#15003](https://github.com/rapidsai/cudf/pull/15003)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::strings::wrap() ([#15002](https://github.com/rapidsai/cudf/pull/15002)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::strings::reverse ([#15001](https://github.com/rapidsai/cudf/pull/15001)) [@davidwendt](https://github.com/davidwendt) +- Deprecate groupby fillna ([#15000](https://github.com/rapidsai/cudf/pull/15000)) [@mroeschke](https://github.com/mroeschke) +- Ensure to_* IO methods respect pandas 2.2 keyword only deprecation ([#14999](https://github.com/rapidsai/cudf/pull/14999)) [@mroeschke](https://github.com/mroeschke) +- Remove unneeded calls to create_chars_child_column utility ([#14997](https://github.com/rapidsai/cudf/pull/14997)) [@davidwendt](https://github.com/davidwendt) +- Add environment-agnostic scripts for running ctests and pytests ([#14992](https://github.com/rapidsai/cudf/pull/14992)) [@trxcllnt](https://github.com/trxcllnt) +- Filter all `DeprecationWarning`'s by `ArrowTable.to_pandas()` ([#14989](https://github.com/rapidsai/cudf/pull/14989)) [@galipremsagar](https://github.com/galipremsagar) +- Deprecate replace with categorical columns ([#14988](https://github.com/rapidsai/cudf/pull/14988)) [@mroeschke](https://github.com/mroeschke) +- Deprecate delim_whitespace in read_csv for pandas 2.2 ([#14986](https://github.com/rapidsai/cudf/pull/14986)) [@mroeschke](https://github.com/mroeschke) +- Deprecate parameters similar to pandas 2.2 ([#14984](https://github.com/rapidsai/cudf/pull/14984)) [@mroeschke](https://github.com/mroeschke) +- 
Ensure that `ctest` is called with `--no-tests=error`. ([#14983](https://github.com/rapidsai/cudf/pull/14983)) [@bdice](https://github.com/bdice) +- Deprecate non-integer `periods` in `date_range` and `interval_range` ([#14976](https://github.com/rapidsai/cudf/pull/14976)) [@galipremsagar](https://github.com/galipremsagar) +- Update ops-bot.yaml ([#14974](https://github.com/rapidsai/cudf/pull/14974)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use page statistics in Parquet reader ([#14973](https://github.com/rapidsai/cudf/pull/14973)) [@etseidl](https://github.com/etseidl) +- Use fused types for overloaded function signatures ([#14969](https://github.com/rapidsai/cudf/pull/14969)) [@vyasr](https://github.com/vyasr) +- Deprecate certain frequency strings ([#14967](https://github.com/rapidsai/cudf/pull/14967)) [@galipremsagar](https://github.com/galipremsagar) +- Update copyrights for 24.04. ([#14964](https://github.com/rapidsai/cudf/pull/14964)) [@bdice](https://github.com/bdice) +- Add missing atomic operators, refactor atomic operators, move atomic operators to detail namespace. ([#14962](https://github.com/rapidsai/cudf/pull/14962)) [@bdice](https://github.com/bdice) +- Introduce `GetJsonObjectOptions` in `getJSONObject` Java API ([#14956](https://github.com/rapidsai/cudf/pull/14956)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- JNI JSON read with DataSource and inferred schema, along with basic Java nested Schema JSON reads ([#14954](https://github.com/rapidsai/cudf/pull/14954)) [@revans2](https://github.com/revans2) +- Make codecov only informational (always pass). 
([#14952](https://github.com/rapidsai/cudf/pull/14952)) [@bdice](https://github.com/bdice) +- Replace legacy cudf and dask_cudf imports as (d)gd ([#14944](https://github.com/rapidsai/cudf/pull/14944)) [@mroeschke](https://github.com/mroeschke) +- Replace _is_datetime64tz/interval_dtype with isinstance ([#14943](https://github.com/rapidsai/cudf/pull/14943)) [@mroeschke](https://github.com/mroeschke) +- Update tests for pandas 2. ([#14941](https://github.com/rapidsai/cudf/pull/14941)) [@bdice](https://github.com/bdice) +- Use more public pandas APIs ([#14929](https://github.com/rapidsai/cudf/pull/14929)) [@mroeschke](https://github.com/mroeschke) +- Replace local copyright check with pre-commit-hooks verify-copyright ([#14917](https://github.com/rapidsai/cudf/pull/14917)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add `pandas-2.x` support in `cudf` ([#14916](https://github.com/rapidsai/cudf/pull/14916)) [@galipremsagar](https://github.com/galipremsagar) +- Use offsetalator in nvtext::byte_pair_encoding ([#14888](https://github.com/rapidsai/cudf/pull/14888)) [@davidwendt](https://github.com/davidwendt) +- De-DOS line-endings ([#14880](https://github.com/rapidsai/cudf/pull/14880)) [@wence-](https://github.com/wence-) +- Add detail `cuco_allocator` ([#14877](https://github.com/rapidsai/cudf/pull/14877)) [@PointKernel](https://github.com/PointKernel) +- Move all core types to using enum class in Cython ([#14876](https://github.com/rapidsai/cudf/pull/14876)) [@vyasr](https://github.com/vyasr) +- Read `cudf.__version__` in Sphinx build ([#14872](https://github.com/rapidsai/cudf/pull/14872)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Use int64 offset types for accessing code-points in nvtext::normalize ([#14868](https://github.com/rapidsai/cudf/pull/14868)) [@davidwendt](https://github.com/davidwendt) +- Read version from VERSION file in CMake ([#14867](https://github.com/rapidsai/cudf/pull/14867)) 
[@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Update conda-cpp-post-build-checks to branch-24.04. ([#14854](https://github.com/rapidsai/cudf/pull/14854)) [@bdice](https://github.com/bdice) +- Update cudf for compatibility with the latest cuco ([#14849](https://github.com/rapidsai/cudf/pull/14849)) [@PointKernel](https://github.com/PointKernel) +- Remove deprecated strings functions ([#14848](https://github.com/rapidsai/cudf/pull/14848)) [@davidwendt](https://github.com/davidwendt) +- Fix CI workflows for pandas-tests and add test summary. ([#14847](https://github.com/rapidsai/cudf/pull/14847)) [@bdice](https://github.com/bdice) +- Use offsetalator in cudf::strings::copy_slice ([#14844](https://github.com/rapidsai/cudf/pull/14844)) [@davidwendt](https://github.com/davidwendt) +- Fix V2 Parquet page alignment for use with zStandard compression ([#14841](https://github.com/rapidsai/cudf/pull/14841)) [@etseidl](https://github.com/etseidl) +- Fix calls to deprecated strings factory API in examples. ([#14838](https://github.com/rapidsai/cudf/pull/14838)) [@bdice](https://github.com/bdice) +- Update pre-commit hooks ([#14837](https://github.com/rapidsai/cudf/pull/14837)) [@bdice](https://github.com/bdice) +- Use `rapids_cuda_set_runtime` to determine cuda runtime usage by target ([#14833](https://github.com/rapidsai/cudf/pull/14833)) [@vyasr](https://github.com/vyasr) +- Remove get_mem_info functions from custom memory resources ([#14832](https://github.com/rapidsai/cudf/pull/14832)) [@harrism](https://github.com/harrism) +- Fix debug build by splitting row_operator_tests_utilities.cu ([#14826](https://github.com/rapidsai/cudf/pull/14826)) [@davidwendt](https://github.com/davidwendt) +- Remove -DNVBench_ENABLE_CUPTI=OFF. 
([#14820](https://github.com/rapidsai/cudf/pull/14820)) [@bdice](https://github.com/bdice) +- Use cuco::static_set in the hash-based groupby ([#14813](https://github.com/rapidsai/cudf/pull/14813)) [@PointKernel](https://github.com/PointKernel) +- Branch 24.04 merge branch 24.02 ([#14809](https://github.com/rapidsai/cudf/pull/14809)) [@vyasr](https://github.com/vyasr) +- Branch 24.04 merge branch 24.02 ([#14806](https://github.com/rapidsai/cudf/pull/14806)) [@vyasr](https://github.com/vyasr) +- Introduce basic "cudf" backend for Dask Expressions ([#14805](https://github.com/rapidsai/cudf/pull/14805)) [@rjzamora](https://github.com/rjzamora) +- Remove `build_struct|list_column` ([#14786](https://github.com/rapidsai/cudf/pull/14786)) [@mroeschke](https://github.com/mroeschke) +- Use offsetalator in nvtext tokenize functions ([#14783](https://github.com/rapidsai/cudf/pull/14783)) [@davidwendt](https://github.com/davidwendt) +- Reduce execution time of Python ORC tests ([#14776](https://github.com/rapidsai/cudf/pull/14776)) [@vuule](https://github.com/vuule) +- Use offsetalator in cudf::strings::split functions ([#14757](https://github.com/rapidsai/cudf/pull/14757)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::strings::findall ([#14745](https://github.com/rapidsai/cudf/pull/14745)) [@davidwendt](https://github.com/davidwendt) +- Use offsetalator in cudf::strings::url_decode ([#14744](https://github.com/rapidsai/cudf/pull/14744)) [@davidwendt](https://github.com/davidwendt) +- Use get_offset_value utility in strings shift function ([#14743](https://github.com/rapidsai/cudf/pull/14743)) [@davidwendt](https://github.com/davidwendt) +- Use as_column instead of full ([#14698](https://github.com/rapidsai/cudf/pull/14698)) [@mroeschke](https://github.com/mroeschke) +- List all notable breaking changes ([#13535](https://github.com/rapidsai/cudf/pull/13535)) [@galipremsagar](https://github.com/galipremsagar) + # cuDF 24.02.00 (12 Feb 2024) ## 🚨 
Breaking Changes From 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Apr 2024 05:47:58 -1000 Subject: [PATCH 4/6] Use less _is_categorical_dtype (#15148) Rehash of https://github.com/rapidsai/cudf/pull/14942 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15148 --- python/cudf/cudf/_fuzz_testing/csv.py | 2 +- python/cudf/cudf/_fuzz_testing/json.py | 2 +- python/cudf/cudf/_lib/csv.pyx | 15 +++--- python/cudf/cudf/core/column/column.py | 7 +-- python/cudf/cudf/core/dtypes.py | 10 +++- python/cudf/cudf/testing/testing.py | 24 +++++----- python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/tests/test_concat.py | 66 ++++++++------------------ python/cudf/cudf/tests/test_csv.py | 22 +++++++-- python/cudf/cudf/utils/dtypes.py | 4 +- 10 files changed, 78 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 5b49143fd5a..67211a1c4bf 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -99,7 +99,7 @@ def set_rand_params(self, params): if dtype_val is not None: dtype_val = { col_name: "category" - if cudf.utils.dtypes._is_categorical_dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype) else pandas_dtypes_to_np_dtypes[dtype] for col_name, dtype in dtype_val.items() } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index bffd508b2ef..e987529c8ba 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val): if dtype_val is not None and isinstance(dtype_val, 
abc.Mapping): processed_dtypes = {} for col_name, dtype in dtype_val.items(): - if cudf.utils.dtypes._is_categorical_dtype(dtype): + if isinstance(dtype, cudf.CategoricalDtype): processed_dtypes[col_name] = "category" else: processed_dtypes[col_name] = str( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 0f0bc3ce81a..b2e4d442bd2 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -434,7 +434,7 @@ def read_csv( if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - if cudf.api.types._is_categorical_dtype(v): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): df._data[str(k)] = df._data[str(k)].astype(v) elif ( cudf.api.types.is_scalar(dtype) or @@ -442,11 +442,11 @@ def read_csv( np.dtype, pd.api.extensions.ExtensionDtype, type )) ): - if cudf.api.types._is_categorical_dtype(dtype): + if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): df = df.astype(dtype) elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): - if cudf.api.types._is_categorical_dtype(col_dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) @@ -554,11 +554,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 - if cudf.api.types._is_categorical_dtype(dtype): - if isinstance(dtype, str): - dtype = "str" - else: - dtype = dtype.categories.dtype + if isinstance(dtype, cudf.CategoricalDtype): + dtype = dtype.categories.dtype + elif dtype == "category": + dtype = "str" if isinstance(dtype, str): if str(dtype) == "date32": diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 67f44ad2f48..c8a6493ddda 100644 --- a/python/cudf/cudf/core/column/column.py +++ 
b/python/cudf/cudf/core/column/column.py @@ -52,7 +52,6 @@ from cudf._lib.types import size_type_dtype from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( - _is_categorical_dtype, _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, @@ -1381,7 +1380,7 @@ def column_empty_like( if ( hasattr(column, "dtype") - and _is_categorical_dtype(column.dtype) + and isinstance(column.dtype, cudf.CategoricalDtype) and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) @@ -2008,7 +2007,9 @@ def as_column( length = 1 elif length < 0: raise ValueError(f"{length=} must be >=0.") - if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype): + if isinstance( + arbitrary, pd.Interval + ) or cudf.api.types._is_categorical_dtype(dtype): # No cudf.Scalar support yet return as_column( pd.Series([arbitrary] * length), diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 73617763221..9bb1995b836 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -51,6 +51,11 @@ def dtype(arbitrary): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype + if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}: + # read_csv only accepts "hex" + # e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow + return arbitrary + # use `pandas_dtype` to try and interpret # `arbitrary` as a Pandas extension type. # Return the corresponding NumPy/cuDF type. 
@@ -999,7 +1004,10 @@ def _is_categorical_dtype(obj): pd.Series, ), ): - return _is_categorical_dtype(obj.dtype) + try: + return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype) + except TypeError: + return False if hasattr(obj, "type"): if obj.type is pd.CategoricalDtype.type: return True diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index fc253c5c197..dffbbe92fc1 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -8,11 +8,7 @@ import cudf from cudf._lib.unary import is_nan -from cudf.api.types import ( - _is_categorical_dtype, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT @@ -86,7 +82,7 @@ def _check_types( if ( exact and not isinstance(left, cudf.MultiIndex) - and _is_categorical_dtype(left) + and isinstance(left.dtype, cudf.CategoricalDtype) ): if left.dtype != right.dtype: raise_assert_detail( @@ -144,8 +140,8 @@ def assert_column_equal( """ if check_dtype is True: if ( - _is_categorical_dtype(left) - and _is_categorical_dtype(right) + isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, cudf.CategoricalDtype) and not check_categorical ): pass @@ -173,7 +169,9 @@ def assert_column_equal( return if check_exact and check_categorical: - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left_cat = left.categories right_cat = right.categories @@ -207,8 +205,8 @@ def assert_column_equal( if ( not check_dtype - and _is_categorical_dtype(left) - and _is_categorical_dtype(right) + and isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, cudf.CategoricalDtype) ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) @@ -258,7 +256,9 @@ def assert_column_equal( raise e else: 
columns_equal = False - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 2f70f955fa9..dace8009041 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif cudf.api.types.is_string_dtype(col.dtype): @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): else: pd_series = series.to_pandas() - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): # The cudf.Series is constructed from an already sliced column, whereas # the pandas.Series is constructed from the unsliced series and then # sliced, so the indexes should be different and we must ignore it. 
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3d638da924b..87b3beb5589 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,7 +9,6 @@ import pytest import cudf -from cudf.api.types import _is_categorical_dtype from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( assert_eq, @@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual = cudf.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): + if not isinstance(expected[key].dtype, pd.CategoricalDtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0]) def test_concat_join_empty_dataframes( - df, other, ignore_index, axis, join, sort + request, df, other, ignore_index, join, sort ): + axis = 0 other_pd = [df] + other gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes( actual = cudf.concat( other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort ) - if expected.shape != df.shape: - if axis == 0: - for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): - # TODO: Pandas bug: - # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = ( - expected[key].fillna("-1").astype("str") 
- ) - else: - expected[key] = ( - expected[key] - .cat.add_categories(["-1"]) - .fillna("-1") - .astype("str") - ) - actual[key] = col.astype("str").fillna("-1") - else: - expected[key] = expected[key].fillna(-1) - actual[key] = col.fillna(-1) - - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, - check_column_type=False, - ) - else: - # no need to fill in if axis=1 - assert_eq( - expected, - actual, - check_index_type=False, - check_column_type=False, + if ( + join == "outer" + and any( + isinstance(dtype, pd.CategoricalDtype) + for dtype in df.dtypes.tolist() + ) + and any( + isinstance(dtype, pd.CategoricalDtype) + for other_df in other + for dtype in other_df.dtypes.tolist() + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/42840" ) + ) assert_eq( expected, actual, check_dtype=False, - check_index_type=False, check_column_type=False, ) @@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): + if isinstance(expected[key].dtype, pd.CategoricalDtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") # if not expected.empty: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2d728fb94ba..5009a7f2628 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -272,14 +272,30 @@ def test_csv_reader_mixed_data_delimiter_sep( gdf1 = read_csv( str(fname), names=["1", "2", "3", "4", "5", "6", "7"], - dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], dayfirst=True, **cudf_arg, ) gdf2 = read_csv( str(fname), names=["1", "2", "3", "4", "5", 
"6", "7"], - dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], dayfirst=True, **pandas_arg, ) @@ -368,7 +384,7 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): out = read_csv( str(fname), names=["1", "2", "3"], - dtype=["int64", "date", "float64"], + dtype=["int64", "datetime64[ns]", "float64"], skiprows=1, skipfooter=1, dayfirst=True, diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8521239413e..a33b5ca139c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -392,9 +392,9 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): - if cudf.api.types._is_categorical_dtype(lhs.dtype): + if isinstance(lhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif cudf.api.types._is_categorical_dtype(rhs.dtype): + elif isinstance(rhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) return (lhs.dtype == "object" and rhs.dtype != "object") or ( From 888e9d5c38cb27402313681744b87462846bc405 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 10 Apr 2024 17:56:10 -0400 Subject: [PATCH 5/6] Floating <--> fixed-point conversion must now be called explicitly (#15438) This change makes it so fixed_point objects can no longer be constructed with floating point values, and can no longer be casted to floating point values. Instead the functions added to unary.hpp must be explicitly called. In addition to making it more clear when and where these conversions are occurring, this also makes it so that the low-level fixed_point.hpp header won't be inundated with all of the complex lossless conversion code to come. 
Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Shruti Shivakumar (https://github.com/shrshi) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/15438 --- cpp/include/cudf/fixed_point/fixed_point.hpp | 49 +---- cpp/include/cudf/unary.hpp | 75 ++++++- cpp/include/cudf/utilities/traits.hpp | 7 +- cpp/src/binaryop/compiled/binary_ops.cuh | 19 +- cpp/src/quantiles/quantiles_util.hpp | 9 +- .../quantiles/tdigest/tdigest_aggregation.cu | 14 +- cpp/src/unary/cast_ops.cu | 16 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 189 +++++++++--------- cpp/tests/io/orc_test.cpp | 2 +- 9 files changed, 219 insertions(+), 161 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 4445af6c5a8..e39d75757e8 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -67,18 +67,6 @@ constexpr inline auto is_supported_representation_type() cuda::std::is_same_v; } -/** - * @brief Returns `true` if the value type is supported for constructing a `fixed_point` - * - * @tparam T The construction value type - * @return `true` if the value type is supported to construct a `fixed_point` type - */ -template -constexpr inline auto is_supported_construction_value_type() -{ - return cuda::std::is_integral() || cuda::std::is_floating_point_v; -} - /** @} */ // end of group // Helper functions for `fixed_point` type @@ -222,23 +210,8 @@ class fixed_point { scale_type _scale; public: - using rep = Rep; ///< The representation type - - /** - * @brief Constructor that will perform shifting to store value appropriately (from floating point - * types) - * - * @tparam T The floating point type that you are constructing from - * @param value The value that will be constructed from - * @param scale The exponent that is applied to Rad to perform shifting - */ - template () && - 
is_supported_representation_type()>* = nullptr> - CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) - : _value{static_cast(detail::shift(value, scale))}, _scale{scale} - { - } + using rep = Rep; ///< The representation type + static constexpr auto rad = Rad; ///< The base /** * @brief Constructor that will perform shifting to store value appropriately (from integral @@ -249,7 +222,7 @@ class fixed_point { * @param scale The exponent that is applied to Rad to perform shifting */ template () && + typename cuda::std::enable_if_t && is_supported_representation_type()>* = nullptr> CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) // `value` is cast to `Rep` to avoid overflow in cases where @@ -275,8 +248,7 @@ class fixed_point { * @tparam T The value type being constructing from * @param value The value that will be constructed from */ - template ()>* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline fixed_point(T const& value) : _value{static_cast(value)}, _scale{scale_type{0}} { @@ -288,19 +260,6 @@ class fixed_point { */ CUDF_HOST_DEVICE inline fixed_point() : _scale{scale_type{0}} {} - /** - * @brief Explicit conversion operator for casting to floating point types - * - * @tparam U The floating point type that is being explicitly converted to - * @return The `fixed_point` number in base 10 (aka human readable format) - */ - template >* = nullptr> - explicit constexpr operator U() const - { - return detail::shift(static_cast(_value), scale_type{-_scale}); - } - /** * @brief Explicit conversion operator for casting to integral types * diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 64e802d88dd..5ded22488c7 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,10 @@ #pragma once +#include #include #include +#include #include @@ -31,6 +33,77 @@ namespace cudf { * @brief Column APIs for unary ops */ +/** + * @brief Convert a floating-point value to fixed point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included in many parts of the + * code base that don't need it, and so that it's easier to pinpoint where these + * conversions occur. + * + * @tparam Fixed The fixed-point type to convert to + * @tparam Floating The floating-point type to convert from + * @param floating The floating-point value to convert + * @param scale The desired scale of the fixed-point value + * @return The converted fixed-point value + */ +template () && + cuda::std::is_floating_point_v>* = nullptr> +CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) +{ + using Rep = typename Fixed::rep; + auto const shifted = numeric::detail::shift(floating, scale); + numeric::scaled_integer scaled{static_cast(shifted), scale}; + return Fixed(scaled); +} + +/** + * @brief Convert a fixed-point value to floating point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included in many parts of the + * code base that don't need it, and so that it's easier to pinpoint where these + * conversions occur. 
+ * + * @tparam Floating The floating-point type to convert to + * @tparam Fixed The fixed-point type to convert from + * @param fixed The fixed-point value to convert + * @return The converted floating-point value + */ +template && + is_fixed_point()>* = nullptr> +CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) +{ + using Rep = typename Fixed::rep; + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); +} + +/** + * @brief Convert a value to floating point + * + * @tparam Floating The floating-point type to convert to + * @tparam Input The input type to convert from + * @param input The input value to convert + * @return The converted floating-point value + */ +template >* = nullptr> +CUDF_HOST_DEVICE Floating convert_to_floating(Input input) +{ + if constexpr (is_fixed_point()) { + return convert_fixed_to_floating(input); + } else { + return static_cast(input); + } +} + /** * @brief Types of unary operations that can be performed on data. */ diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 2dda0740b96..d191e44228a 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -397,7 +397,10 @@ template constexpr inline bool is_fixed_point() { return std::is_same_v || std::is_same_v || - std::is_same_v; + std::is_same_v || + std::is_same_v, T> || + std::is_same_v, T> || + std::is_same_v, T>; } /** diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index d605c877d3f..0bc144baa83 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -69,13 +70,17 @@ struct typed_casted_writer { if constexpr (mutable_column_device_view::has_element_accessor() and std::is_constructible_v) { col.element(i) = static_cast(val); - } else if constexpr (is_fixed_point() and - (is_fixed_point() or - std::is_constructible_v)) { - if constexpr (is_fixed_point()) - col.data()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value(); - else - col.data()[i] = Element{val, numeric::scale_type{col.type().scale()}}.value(); + } else if constexpr (is_fixed_point()) { + auto const scale = numeric::scale_type{col.type().scale()}; + if constexpr (is_fixed_point()) { + col.data()[i] = val.rescaled(scale).value(); + } else if constexpr (cuda::std::is_constructible_v) { + col.data()[i] = Element{val, scale}.value(); + } else if constexpr (cuda::std::is_floating_point_v) { + col.data()[i] = convert_floating_to_fixed(val, scale).value(); + } + } else if constexpr (cuda::std::is_floating_point_v and is_fixed_point()) { + col.data()[i] = convert_fixed_to_floating(val); } } }; diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 5efafdd0be6..47864c25c5f 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -46,8 +47,8 @@ CUDF_HOST_DEVICE inline Result linear(T lhs, T rhs, double frac) // Underflow may occur when converting int64 to double // detail: 
https://github.com/rapidsai/cudf/issues/1417 - auto dlhs = static_cast(lhs); - auto drhs = static_cast(rhs); + auto dlhs = convert_to_floating(lhs); + auto drhs = convert_to_floating(rhs); double one_minus_frac = 1.0 - frac; return static_cast(one_minus_frac * dlhs + frac * drhs); } @@ -56,8 +57,8 @@ template CUDF_HOST_DEVICE inline Result midpoint(T lhs, T rhs) { // TODO: try std::midpoint (C++20) if available - auto dlhs = static_cast(lhs); - auto drhs = static_cast(rhs); + auto dlhs = convert_to_floating(lhs); + auto drhs = convert_to_floating(rhs); return static_cast(dlhs / 2 + drhs / 2); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 56e1bfbe003..8544d9caa56 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -73,7 +74,7 @@ struct make_centroid { centroid operator() __device__(size_type index) const { auto const is_valid = col.is_valid(index); - auto const mean = is_valid ? static_cast(col.element(index)) : 0.0; + auto const mean = is_valid ? convert_to_floating(col.element(index)) : 0.0; auto const weight = is_valid ? 1.0 : 0.0; return {mean, weight, is_valid}; } @@ -87,7 +88,7 @@ struct make_centroid_no_nulls { centroid operator() __device__(size_type index) const { - return {static_cast(col.element(index)), 1.0, true}; + return {convert_to_floating(col.element(index)), 1.0, true}; } }; @@ -808,8 +809,9 @@ struct get_scalar_minmax_grouped { auto const valid_count = group_valid_counts[group_index]; return valid_count > 0 ? 
thrust::make_tuple( - static_cast(col.element(group_offsets[group_index])), - static_cast(col.element(group_offsets[group_index] + valid_count - 1))) + convert_to_floating(col.element(group_offsets[group_index])), + convert_to_floating( + col.element(group_offsets[group_index] + valid_count - 1))) : thrust::make_tuple(0.0, 0.0); } }; @@ -823,8 +825,8 @@ struct get_scalar_minmax { __device__ thrust::tuple operator()(size_type) { return valid_count > 0 - ? thrust::make_tuple(static_cast(col.element(0)), - static_cast(col.element(valid_count - 1))) + ? thrust::make_tuple(convert_to_floating(col.element(0)), + convert_to_floating(col.element(valid_count - 1))) : thrust::make_tuple(0.0, 0.0); } }; diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 47a0cb393aa..b6c9b3caa20 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -116,8 +116,12 @@ struct fixed_point_unary_cast { std::enable_if_t<(cudf::is_fixed_point<_SourceT>() && cudf::is_numeric())>* = nullptr> __device__ inline TargetT operator()(DeviceT const element) { - auto const fp = SourceT{numeric::scaled_integer{element, scale}}; - return static_cast(fp); + auto const fixed_point = SourceT{numeric::scaled_integer{element, scale}}; + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(fixed_point); + } else { + return static_cast(fixed_point); + } } template < @@ -126,7 +130,11 @@ struct fixed_point_unary_cast { std::enable_if_t<(cudf::is_numeric<_SourceT>() && cudf::is_fixed_point())>* = nullptr> __device__ inline DeviceT operator()(SourceT const element) { - return TargetT{element, scale}.value(); + if constexpr (cuda::std::is_floating_point_v) { + return convert_floating_to_fixed(element, scale).value(); + } else { + return TargetT{element, scale}.value(); + } } }; diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 1c1680fcd6e..73de1fbaa68 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -45,67 +46,71 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) { using decimalXX = fixed_point; - decimalXX num0{1.234567, scale_type{0}}; - decimalXX num1{1.234567, scale_type{-1}}; - decimalXX num2{1.234567, scale_type{-2}}; - decimalXX num3{1.234567, scale_type{-3}}; - decimalXX num4{1.234567, scale_type{-4}}; - decimalXX num5{1.234567, scale_type{-5}}; - decimalXX num6{1.234567, scale_type{-6}}; - - EXPECT_EQ(1, static_cast(num0)); - EXPECT_EQ(1.2, static_cast(num1)); - EXPECT_EQ(1.23, static_cast(num2)); - EXPECT_EQ(1.234, static_cast(num3)); - EXPECT_EQ(1.2345, static_cast(num4)); - EXPECT_EQ(1.23456, static_cast(num5)); - EXPECT_EQ(1.234567, static_cast(num6)); + auto num0 = 
cudf::convert_floating_to_fixed(1.234567, scale_type(0)); + auto num1 = cudf::convert_floating_to_fixed(1.234567, scale_type(-1)); + auto num2 = cudf::convert_floating_to_fixed(1.234567, scale_type(-2)); + auto num3 = cudf::convert_floating_to_fixed(1.234567, scale_type(-3)); + auto num4 = cudf::convert_floating_to_fixed(1.234567, scale_type(-4)); + auto num5 = cudf::convert_floating_to_fixed(1.234567, scale_type(-5)); + auto num6 = cudf::convert_floating_to_fixed(1.234567, scale_type(-6)); + + EXPECT_EQ(1, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(1.2, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(1.23, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(1.234, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating(num6)); } TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) { using decimalXX = fixed_point; - decimalXX num0{-1.234567, scale_type{0}}; - decimalXX num1{-1.234567, scale_type{-1}}; - decimalXX num2{-1.234567, scale_type{-2}}; - decimalXX num3{-1.234567, scale_type{-3}}; - decimalXX num4{-1.234567, scale_type{-4}}; - decimalXX num5{-1.234567, scale_type{-5}}; - decimalXX num6{-1.234567, scale_type{-6}}; - - EXPECT_EQ(-1, static_cast(num0)); - EXPECT_EQ(-1.2, static_cast(num1)); - EXPECT_EQ(-1.23, static_cast(num2)); - EXPECT_EQ(-1.234, static_cast(num3)); - EXPECT_EQ(-1.2345, static_cast(num4)); - EXPECT_EQ(-1.23456, static_cast(num5)); - EXPECT_EQ(-1.234567, static_cast(num6)); + auto num0 = cudf::convert_floating_to_fixed(-1.234567, scale_type(0)); + auto num1 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-1)); + auto num2 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-2)); + auto num3 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-3)); + auto num4 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-4)); + auto num5 = 
cudf::convert_floating_to_fixed(-1.234567, scale_type(-5)); + auto num6 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-6)); + + EXPECT_EQ(-1, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(-1.23, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(-1.234, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating(num6)); } TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) { using decimalXX = fixed_point; - decimalXX a{1.1, scale_type{-1}}; - decimalXX b{1.01, scale_type{-2}}; - decimalXX c{1.001, scale_type{-3}}; - decimalXX d{1.0001, scale_type{-4}}; - decimalXX e{1.00001, scale_type{-5}}; - decimalXX f{1.000001, scale_type{-6}}; - - decimalXX x{1.000123, scale_type{-8}}; - decimalXX y{0.000123, scale_type{-8}}; - - EXPECT_EQ(1.1, static_cast(a)); - EXPECT_EQ(1.01, static_cast(b)); - EXPECT_EQ(1, static_cast(c)); // intentional (inherited problem from floating point) - EXPECT_EQ(1.0001, static_cast(d)); - EXPECT_EQ(1.00001, static_cast(e)); - EXPECT_EQ(1, static_cast(f)); // intentional (inherited problem from floating point) - - EXPECT_TRUE(1.000123 - static_cast(x) < std::numeric_limits::epsilon()); - EXPECT_EQ(0.000123, static_cast(y)); + auto a = cudf::convert_floating_to_fixed(1.1, scale_type(-1)); + auto b = cudf::convert_floating_to_fixed(1.01, scale_type(-2)); + auto c = cudf::convert_floating_to_fixed(1.001, scale_type(-3)); + auto d = cudf::convert_floating_to_fixed(1.0001, scale_type(-4)); + auto e = cudf::convert_floating_to_fixed(1.00001, scale_type(-5)); + auto f = cudf::convert_floating_to_fixed(1.000001, scale_type(-6)); + auto x = cudf::convert_floating_to_fixed(1.000123, scale_type(-8)); + auto y = cudf::convert_floating_to_fixed(0.000123, scale_type(-8)); + + EXPECT_EQ(1.1, 
cudf::convert_fixed_to_floating(a)); + EXPECT_EQ(1.01, cudf::convert_fixed_to_floating(b)); + EXPECT_EQ(1, + cudf::convert_fixed_to_floating( + c)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating(d)); + EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating(e)); + EXPECT_EQ(1, + cudf::convert_fixed_to_floating( + f)); // intentional (inherited problem from floating point) + + EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating(x) < + std::numeric_limits::epsilon()); + EXPECT_EQ(0.000123, cudf::convert_fixed_to_floating(y)); } TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction) @@ -118,34 +123,34 @@ TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction) binary_fp num3{10, scale_type{3}}; binary_fp num4{10, scale_type{4}}; - binary_fp num5{1.24, scale_type{0}}; - binary_fp num6{1.24, scale_type{-1}}; - binary_fp num7{1.32, scale_type{-2}}; - binary_fp num8{1.41, scale_type{-3}}; - binary_fp num9{1.45, scale_type{-4}}; - - EXPECT_EQ(10, static_cast(num0)); - EXPECT_EQ(10, static_cast(num1)); - EXPECT_EQ(8, static_cast(num2)); - EXPECT_EQ(8, static_cast(num3)); - EXPECT_EQ(0, static_cast(num4)); - - EXPECT_EQ(1, static_cast(num5)); - EXPECT_EQ(1, static_cast(num6)); - EXPECT_EQ(1.25, static_cast(num7)); - EXPECT_EQ(1.375, static_cast(num8)); - EXPECT_EQ(1.4375, static_cast(num9)); + auto num5 = cudf::convert_floating_to_fixed(1.24, scale_type(0)); + auto num6 = cudf::convert_floating_to_fixed(1.24, scale_type(-1)); + auto num7 = cudf::convert_floating_to_fixed(1.32, scale_type(-2)); + auto num8 = cudf::convert_floating_to_fixed(1.41, scale_type(-3)); + auto num9 = cudf::convert_floating_to_fixed(1.45, scale_type(-4)); + + EXPECT_EQ(10, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(10, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(8, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(8, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(0, cudf::convert_fixed_to_floating(num4)); + 
+ EXPECT_EQ(1, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(1, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(1.25, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(1.375, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(1.4375, cudf::convert_fixed_to_floating(num9)); } TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction) { using binary_fp = fixed_point; - binary_fp num0{1.25, scale_type{-2}}; - binary_fp num1{2.1, scale_type{-4}}; + auto num0 = cudf::convert_floating_to_fixed(1.25, scale_type(-2)); + auto num1 = cudf::convert_floating_to_fixed(2.1, scale_type(-4)); - EXPECT_EQ(1.25, static_cast(num0)); - EXPECT_EQ(2.0625, static_cast(num1)); + EXPECT_EQ(1.25, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating(num1)); } TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) @@ -166,7 +171,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) EXPECT_EQ(TWO / ONE, TWO); EXPECT_EQ(SIX / TWO, THREE); - decimalXX a{1.23, scale_type{-2}}; + auto a = cudf::convert_floating_to_fixed(1.23, scale_type(-2)); decimalXX b{0, scale_type{0}}; EXPECT_EQ(a + b, a); @@ -211,8 +216,8 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXTrickyDivision) EXPECT_EQ(SIXTY_1 / TEN_0, ONE_1); EXPECT_EQ(SIXTY_1 / TEN_1, SIX_0); - decimalXX A{34.56, scale_type{-2}}; - decimalXX B{1.234, scale_type{-3}}; + auto A = cudf::convert_floating_to_fixed(34.56, scale_type(-2)); + auto B = cudf::convert_floating_to_fixed(1.234, scale_type(-3)); decimalXX C{1, scale_type{-2}}; EXPECT_EQ(static_cast(A / B), 20); @@ -255,17 +260,17 @@ TYPED_TEST(FixedPointTestAllReps, ArithmeticWithDifferentScales) using decimalXX = fixed_point; decimalXX a{1, scale_type{0}}; - decimalXX b{1.2, scale_type{-1}}; - decimalXX c{1.23, scale_type{-2}}; - decimalXX d{1.111, scale_type{-3}}; + auto b = cudf::convert_floating_to_fixed(1.2, scale_type(-1)); + auto c = cudf::convert_floating_to_fixed(1.23, scale_type(-2)); + auto d = 
cudf::convert_floating_to_fixed(1.111, scale_type(-3)); - decimalXX x{2.2, scale_type{-1}}; - decimalXX y{3.43, scale_type{-2}}; - decimalXX z{4.541, scale_type{-3}}; + auto x = cudf::convert_floating_to_fixed(2.2, scale_type(-1)); + auto y = cudf::convert_floating_to_fixed(3.43, scale_type(-2)); + auto z = cudf::convert_floating_to_fixed(4.541, scale_type(-3)); - decimalXX xx{0.2, scale_type{-1}}; - decimalXX yy{0.03, scale_type{-2}}; - decimalXX zz{0.119, scale_type{-3}}; + auto xx = cudf::convert_floating_to_fixed(0.2, scale_type(-1)); + auto yy = cudf::convert_floating_to_fixed(0.03, scale_type(-2)); + auto zz = cudf::convert_floating_to_fixed(0.119, scale_type(-3)); EXPECT_EQ(a + b, x); EXPECT_EQ(a + b + c, y); @@ -280,12 +285,12 @@ TYPED_TEST(FixedPointTestAllReps, RescaledTest) using decimalXX = fixed_point; decimalXX num0{1, scale_type{0}}; - decimalXX num1{1.2, scale_type{-1}}; - decimalXX num2{1.23, scale_type{-2}}; - decimalXX num3{1.234, scale_type{-3}}; - decimalXX num4{1.2345, scale_type{-4}}; - decimalXX num5{1.23456, scale_type{-5}}; - decimalXX num6{1.234567, scale_type{-6}}; + auto num1 = cudf::convert_floating_to_fixed(1.2, scale_type(-1)); + auto num2 = cudf::convert_floating_to_fixed(1.23, scale_type(-2)); + auto num3 = cudf::convert_floating_to_fixed(1.234, scale_type(-3)); + auto num4 = cudf::convert_floating_to_fixed(1.2345, scale_type(-4)); + auto num5 = cudf::convert_floating_to_fixed(1.23456, scale_type(-5)); + auto num6 = cudf::convert_floating_to_fixed(1.234567, scale_type(-6)); EXPECT_EQ(num0, num6.rescaled(scale_type{0})); EXPECT_EQ(num1, num6.rescaled(scale_type{-1})); @@ -314,7 +319,7 @@ TYPED_TEST(FixedPointTestAllReps, BoolConversion) { using decimalXX = fixed_point; - decimalXX truthy_value{1.234567, scale_type{0}}; + auto truthy_value = cudf::convert_floating_to_fixed(1.234567, scale_type(0)); decimalXX falsy_value{0, scale_type{0}}; // Test explicit conversions @@ -442,12 +447,14 @@ void float_vector_test(ValueType const 
initial_value, std::vector vec1(size); std::vector vec2(size); - std::iota(std::begin(vec1), std::end(vec1), decimal32{initial_value, scale_type{scale}}); + auto decimal_input = cudf::convert_floating_to_fixed(initial_value, scale_type{scale}); + std::iota(std::begin(vec1), std::end(vec1), decimal_input); std::iota(std::begin(vec2), std::end(vec2), initial_value); auto equal = std::equal( std::cbegin(vec1), std::cend(vec1), std::cbegin(vec2), [](auto const& a, auto const& b) { - return static_cast(a) - b <= std::numeric_limits::epsilon(); + return cudf::convert_fixed_to_floating(a) - b <= + std::numeric_limits::epsilon(); }); EXPECT_TRUE(equal); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index e108e68e1f9..a544a812efb 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -548,7 +548,7 @@ TEST_F(OrcWriterTest, SlicedTable) int32_col col0(seq_col0.begin(), seq_col0.end()); str_col col1(strings.begin(), strings.end()); float32_col col2(seq_col2.begin(), seq_col2.end()); - float32_col col3(seq_col3, seq_col3 + num_rows); + dec64_col col3(seq_col3, seq_col3 + num_rows); list_col col4{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; From af33b0aba4dafe82cb5d25811e5e737af6c7faad Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 11 Apr 2024 16:13:09 -0400 Subject: [PATCH 6/6] nanoarrow uses package override for proper pinned versions generation (#15515) The usage of `PATCH_COMMAND` with `rapids_cpm_find` isn't capturable by `+rapids_cpm_generate_pinned_versions`. So we use a nanoarrow json override file to hold the patch we need applied and the custom SHA1 to check out. 
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Jason Lowe (https://github.com/jlowe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15515 --- cpp/cmake/thirdparty/get_nanoarrow.cmake | 36 ++++--------------- .../patches/nanoarrow_override.json | 18 ++++++++++ 2 files changed, 24 insertions(+), 30 deletions(-) create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_override.json diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 884e5a2f368..dc0b8d09746 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,44 +14,20 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) - set(oneValueArgs VERSION FORK PINNED_TAG) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + include(${rapids-cmake-dir}/cpm/package_override.cmake) - # Only run if PKG_VERSION is < 0.5.0 - if(PKG_VERSION VERSION_LESS 0.5.0) - set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff") - set(patch_issues_to_ref - "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]" - ) - set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake") - set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log") - string(TIMESTAMP current_year "%Y" UTC) - configure_file( - ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY - ) - else() - message( - FATAL_ERROR - "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf." 
- ) - endif() + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # The git_url and git_tag are provided by the nanoarrow_override file rapids_cpm_find( - nanoarrow ${PKG_VERSION} + nanoarrow 0.4.0 GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git - GIT_TAG ${PKG_PINNED_TAG} - # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin - # to an actual tag. - GIT_SHALLOW FALSE - PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) endfunction() -find_and_configure_nanoarrow( - VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8 -) +find_and_configure_nanoarrow() diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..0b83d1808cb --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.4.0", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_cmake.diff", + "issue" : "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]", + "fixed_in" : "0.5.0" + } + ] + } + } +}