Commit

Merge branch 'branch-24.10' into test-drop-python-3.9
bdice authored Aug 13, 2024
2 parents 0b43c1a + 3801f81 commit 4f44e6f
Showing 10 changed files with 94 additions and 111 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pandas-tests.yaml
@@ -19,7 +19,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
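The new `matrix_filter` drops the hard-coded `PY_VER == "3.9"` / CUDA 12.5 selection: it keeps only amd64 entries, groups them by CUDA major version, and from each group takes the entry with the highest (Python version, CUDA version) pair. A rough Python sketch of that jq logic, run against a made-up matrix (the entries below are illustrative, not the real workflow matrix):

```python
# Sketch of the jq filter's selection logic; the matrix below is hypothetical.
from itertools import groupby

matrix = [
    {"ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0"},
    {"ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.2.2"},
    {"ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
    {"ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
]

def ver(s):
    # "12.5.1" -> (12, 5, 1) so versions compare numerically, not lexically
    return tuple(int(part) for part in s.split("."))

amd64 = sorted(
    (e for e in matrix if e["ARCH"] == "amd64"),
    key=lambda e: ver(e["CUDA_VER"])[0],  # group_by needs sorted input
)
selected = [
    max(group, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"])))
    for _, group in groupby(amd64, key=lambda e: ver(e["CUDA_VER"])[0])
]
print(selected)
# One entry per CUDA major version, each with the newest Python/CUDA in that
# group, so dropping Python 3.9 needs no further edits to this filter.
```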
4 changes: 3 additions & 1 deletion .github/workflows/pr.yaml
@@ -187,6 +187,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/cudf_pandas_scripts/run_tests.sh
@@ -196,7 +197,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
# Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
77 changes: 0 additions & 77 deletions cpp/benchmarks/iterator/iterator.cu
@@ -30,7 +30,6 @@
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/pair.h>
#include <thrust/reduce.h>

#include <random>
@@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state)
sizeof(TypeParam));
}

// operator+ defined for pair iterator reduction
template <typename T>
__device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pair<T, bool> rhs)
{
return thrust::pair<T, bool>{lhs.first * lhs.second + rhs.first * rhs.second,
lhs.second + rhs.second};
}
// -----------------------------------------------------------------------------
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
int num_items = col.size();
auto begin = d_col->pair_begin<T, has_null>();
reduce_by_cub(result.begin(), begin, num_items, init);
}

template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
auto d_in = d_col->pair_begin<T, has_null>();
auto d_end = d_in + col.size();
thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{});
}

template <class TypeParam, bool cub_or_thrust>
void BM_pair_iterator(benchmark::State& state)
{
cudf::size_type const column_size{(cudf::size_type)state.range(0)};
using T = TypeParam;
auto num_gen = thrust::counting_iterator<cudf::size_type>(0);
auto null_gen =
thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; });

cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_F(num_gen, num_gen + column_size);
cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_T(
num_gen, num_gen + column_size, null_gen);
cudf::column_view hasnull_F = wrap_hasnull_F;
cudf::column_view hasnull_T = wrap_hasnull_T;

// Initialize dev_result to false
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(
1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
pair_iterator_bench_cub<T, false>(hasnull_T,
dev_result); // driven by pair iterator with nulls
} else {
pair_iterator_bench_thrust<T, false>(hasnull_T,
dev_result); // driven by pair iterator with nulls
}
}
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
sizeof(TypeParam));
}

#define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
@@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true);
ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false);
ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true);
ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false);

#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
BM_pair_iterator<type, cub_or_thrust>(state); \
} \
BENCHMARK_REGISTER_F(Iterator, name) \
->RangeMultiplier(10) \
->Range(1000, 10000000) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true);
PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false);
21 changes: 8 additions & 13 deletions cpp/include/cudf/io/text/byte_range_info.hpp
@@ -16,7 +16,6 @@

#pragma once

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/export.hpp>

#include <cstdint>
@@ -40,53 +39,49 @@ class byte_range_info {
int64_t _size{}; ///< size in bytes

public:
constexpr byte_range_info() = default;
byte_range_info() = default;
/**
* @brief Constructs a byte_range_info object
*
* @param offset offset in bytes
* @param size size in bytes
*/
constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
{
CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
CUDF_EXPECTS(size >= 0, "size must be non-negative");
}
byte_range_info(int64_t offset, int64_t size);

/**
* @brief Copy constructor
*
* @param other byte_range_info object to copy
*/
constexpr byte_range_info(byte_range_info const& other) noexcept = default;
byte_range_info(byte_range_info const& other) noexcept = default;
/**
* @brief Copy assignment operator
*
* @param other byte_range_info object to copy
* @return this object after copying
*/
constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;
byte_range_info& operator=(byte_range_info const& other) noexcept = default;

/**
* @brief Get the offset in bytes
*
* @return Offset in bytes
*/
[[nodiscard]] constexpr int64_t offset() { return _offset; }
[[nodiscard]] int64_t offset() const { return _offset; }

/**
* @brief Get the size in bytes
*
* @return Size in bytes
*/
[[nodiscard]] constexpr int64_t size() { return _size; }
[[nodiscard]] int64_t size() const { return _size; }

/**
* @brief Returns whether the span is empty.
*
* @return true iff the span is empty, i.e. `size() == 0`
* @return true iff the range is empty, i.e. `size() == 0`
*/
[[nodiscard]] constexpr bool empty() { return size() == 0; }
[[nodiscard]] bool is_empty() const { return size() == 0; }
};

/**
7 changes: 7 additions & 0 deletions cpp/src/io/text/byte_range_info.cpp
@@ -16,13 +16,20 @@

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/utilities/error.hpp>

#include <limits>

namespace cudf {
namespace io {
namespace text {

byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
{
CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
CUDF_EXPECTS(size >= 0, "size must be non-negative");
}

byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits<int64_t>::max()}; }

std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
2 changes: 1 addition & 1 deletion cpp/src/io/text/multibyte_split.cu
@@ -310,7 +310,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
{
CUDF_FUNC_RANGE();

if (byte_range.empty()) { return make_empty_column(type_id::STRING); }
if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); }

auto device_delim = cudf::string_scalar(delimiter, true, stream, mr);

24 changes: 10 additions & 14 deletions cpp/src/strings/split/split.cuh
@@ -142,7 +142,7 @@ struct base_split_tokenizer {

// max_tokens already included in token counts
if (d_tokens.size() == 1) {
d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()};
d_tokens[0] = string_index_pair{(d_str.empty() ? "" : d_str.data()), d_str.size_bytes()};
return;
}

@@ -357,24 +357,20 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
auto const chars_bytes =
get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
get_offset_value(input.offsets(), input.offset(), stream);
if (chars_bytes == 0) {
auto offsets = cudf::make_column_from_scalar(
numeric_scalar<int32_t>(0, true, stream), strings_count + 1, stream, mr);
auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
return std::pair{std::move(offsets), std::move(tokens)};
}
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

// count the number of delimiters in the entire column
rmm::device_scalar<int64_t> d_count(0, stream);
constexpr int64_t block_size = 512;
constexpr size_type bytes_per_thread = 4;
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
<<<num_blocks, block_size, 0, stream.value()>>>(
tokenizer, d_offsets, chars_bytes, d_count.data());
if (chars_bytes > 0) {
constexpr int64_t block_size = 512;
constexpr size_type bytes_per_thread = 4;
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
<<<num_blocks, block_size, 0, stream.value()>>>(
tokenizer, d_offsets, chars_bytes, d_count.data());
}

// Create a vector of every delimiter position in the chars column.
// These may include overlapping or otherwise out-of-bounds delimiters which
4 changes: 4 additions & 0 deletions cpp/src/strings/split/split_re.cu
@@ -71,6 +71,10 @@ struct token_reader_fn {
auto const token_offset = d_token_offsets[idx];
auto const token_count = d_token_offsets[idx + 1] - token_offset;
auto const d_result = d_tokens + token_offset; // store tokens here
if (nchars == 0) {
d_result[0] = string_index_pair{"", 0};
return;
}

int64_t token_idx = 0;
auto itr = d_str.begin();
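The `nchars == 0` branch added to `token_reader_fn` writes a single empty token for a zero-length string instead of producing no token. That matches how regex splitting of an empty string behaves in Python, which pandas (and therefore cudf.pandas) follows; a quick illustration, not taken from the commit:

```python
import re

# Splitting an empty string still yields one (empty) field ...
assert re.split(r"[ _]", "") == [""]
assert "".split("_") == [""]
# ... except for whitespace splitting, which yields no fields at all.
assert "".split() == []
```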
47 changes: 43 additions & 4 deletions cpp/tests/strings/split_tests.cpp
@@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
}

TEST_F(StringsSplitTest, SplitRecordAllEmpty)
TEST_F(StringsSplitTest, SplitAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto empty = cudf::string_scalar("");
auto delimiter = cudf::string_scalar("s");

auto result = cudf::strings::split(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
result = cudf::strings::rsplit(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);

// the empty delimiter hits the whitespace special case where nothing matches, returning an all-null column
auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0});
result = cudf::strings::split(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
result = cudf::strings::rsplit(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
}

TEST_F(StringsSplitTest, SplitRecordAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto empty = cudf::string_scalar("");
auto delimiter = cudf::string_scalar("s");

using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});
LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}});
LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}});

auto result = cudf::strings::split_record(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
result = cudf::strings::split_record(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);

result = cudf::strings::rsplit_record(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
result = cudf::strings::rsplit_record(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);
}

TEST_F(StringsSplitTest, MultiByteDelimiters)
@@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary)
}
}

TEST_F(StringsSplitTest, SplitRegexAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto prog = cudf::strings::regex_program::create("[ _]");

auto result = cudf::strings::split_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
result = cudf::strings::rsplit_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);

auto rec_result = cudf::strings::split_record_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
rec_result = cudf::strings::rsplit_record_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
}

TEST_F(StringsSplitTest, RSplitRecord)
{
std::vector<char const*> h_strings{
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_string.py
@@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand):
assert_eq(expect, got)


@pytest.mark.parametrize("pat", [None, "\\s+"])
@pytest.mark.parametrize("regex", [False, True])
@pytest.mark.parametrize("expand", [False, True])
def test_string_split_all_empty(pat, regex, expand):
ps = pd.Series(["", "", "", ""], dtype="str")
gs = cudf.Series(["", "", "", ""], dtype="str")

expect = ps.str.split(pat=pat, expand=expand, regex=regex)
got = gs.str.split(pat=pat, expand=expand, regex=regex)

if isinstance(got, cudf.DataFrame):
assert_eq(expect, got, check_column_type=False)
else:
assert_eq(expect, got)


@pytest.mark.parametrize(
"str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
)
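For reference, the pandas behavior the new test parameterizations exercise: splitting an all-empty string column with an explicit pattern (plain or regex) yields one empty token per row, while the default whitespace split yields no tokens. A small pandas-only sketch (outputs shown as comments):

```python
import pandas as pd

s = pd.Series(["", "", "", ""], dtype="str")

s.str.split("s").tolist()                 # [[''], [''], [''], ['']]
s.str.split("\\s+", regex=True).tolist()  # [[''], [''], [''], ['']]
s.str.split().tolist()                    # [[], [], [], []]
```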
