Skip to content

Commit

Permalink
Apply suggestions from reviewer comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 committed Sep 26, 2024
1 parent 03e599c commit 4c4821a
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 70 deletions.
2 changes: 1 addition & 1 deletion cpp/src/join/mixed_join_common_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ struct double_row_equality_comparator {
}
};

// A CUDA Cooperative Group of 4 threads for the hash set.
// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi.
auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1;

// The hash set type used by mixed_semi_join with the build_table.
Expand Down
146 changes: 77 additions & 69 deletions cpp/tests/join/mixed_join_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -797,35 +797,16 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
{
using T1 = double;

auto const random_data = [](size_t size) {
std::vector<T1> values(size);
using uniform_distribution =
typename std::conditional_t<std::is_same_v<T1, bool>,
std::bernoulli_distribution,
std::conditional_t<std::is_floating_point_v<T1>,
std::uniform_real_distribution<T1>,
std::uniform_int_distribution<T1>>>;

static constexpr auto seed = 0xf00d;
static std::mt19937 engine{seed};
static uniform_distribution dist{};
std::generate(values.begin(), values.end(), [&]() { return T1{dist(engine)}; });

return values;
};

auto const random_validity = [&](size_t size) {
std::vector<bool> validity(size);
static constexpr auto seed = 0xcafe;
static std::mt19937 engine{seed};
static std::bernoulli_distribution dist{};
std::generate(validity.begin(), validity.end(), [&]() { return dist(engine); });
// Column size
auto constexpr N = 1000;

return validity;
};
// Generate column data for left and right tables
auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns<T1>(N, 200);
auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns<T1>(N, 100);

// Setup data and nulls for the left table
std::vector<std::pair<std::vector<T1>, std::vector<bool>>> lefts = {
{random_data(500), random_validity(500)}, {random_data(500), random_validity(500)}};
{left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}};
std::vector<cudf::test::fixed_width_column_wrapper<T1>> left_wrappers;
std::vector<cudf::column_view> left_columns;
for (auto [data, valids] : lefts) {
Expand All @@ -834,8 +815,9 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
left_columns.emplace_back(left_wrappers.back());
};

// Setup data and nulls for the right table
std::vector<std::pair<std::vector<T1>, std::vector<bool>>> rights = {
{random_data(250), random_validity(250)}, {random_data(250), random_validity(250)}};
{right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}};
std::vector<cudf::test::fixed_width_column_wrapper<T1>> right_wrappers;
std::vector<cudf::column_view> right_columns;
for (auto [data, valids] : rights) {
Expand All @@ -848,7 +830,7 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
auto const left_table = cudf::table_view{left_columns};
auto const right_table = cudf::table_view{right_columns};

// Use the zeroth column for equality.
// Using the zeroth column for equality.
auto const left_equality = left_table.select({0});
auto const right_equality = right_table.select({0});

Expand All @@ -858,48 +840,74 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
auto left_zero_eq_right_zero =
cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);

// Expected size of left_semi_join with only zeroth column equality.
auto const expected_num_idx_left_zero_eq_right_zero =
cudf::conditional_left_semi_join_size(left_table, right_table, left_zero_eq_right_zero);

// Actual size of mixed_left_semi_join with only zeroth column equality.
auto const num_idx_left_zero_eq_right_zero =
cudf::mixed_left_semi_join(left_equality,
right_equality,
left_table,
right_table,
left_zero_eq_right_zero,
cudf::null_equality::UNEQUAL)
->size();

// Expected and actual sizes must match.
EXPECT_EQ(expected_num_idx_left_zero_eq_right_zero, num_idx_left_zero_eq_right_zero);

// Common column references for conditional column.
auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
auto left_one_gt_right_one =
cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
// Mixed semi join with zeroth column equality
{
// Expected left_semi_join result
auto const expected_mixed_semi_join =
cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero);

// Actual mixed_left_semi_join result
auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
right_equality,
left_table,
right_table,
left_zero_eq_right_zero,
cudf::null_equality::UNEQUAL);

// Copy data back to host for comparisons
auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());

// Sort the indices for 1-1 comparison
std::sort(expected_indices.begin(), expected_indices.end());
std::sort(result_indices.begin(), result_indices.end());

// Expected and actual vectors must match.
EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
EXPECT_TRUE(
std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
}

auto combined_condition = cudf::ast::operation(
cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one);

// Expected size of left_semi_join with zeroth col equality and first col conditional.
auto const expected_num_idx_left_one_greater_right_one =
cudf::conditional_left_semi_join_size(left_table, right_table, combined_condition);

// Actual size of left_semi_join with zeroth col equality and first col conditional.
auto const num_idx_left_one_greater_right_one =
cudf::mixed_left_semi_join(left_equality,
right_equality,
left_table,
right_table,
left_one_gt_right_one,
cudf::null_equality::UNEQUAL)
->size();

// Expected and actual sizes must match.
EXPECT_EQ(expected_num_idx_left_one_greater_right_one, num_idx_left_one_greater_right_one);
// Mixed semi join with zeroth column equality and first column GREATER conditional
{
// Column references for conditional column.
auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
auto left_one_gt_right_one =
cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);

// Expected left_semi_join result
auto const expected_mixed_semi_join = cudf::conditional_left_semi_join(
left_table,
right_table,
cudf::ast::operation(
cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one));

// Actual left_semi_join result
auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
right_equality,
left_table,
right_table,
left_one_gt_right_one,
cudf::null_equality::UNEQUAL);

// Copy data back to host for comparisons
auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());

// Sort the indices for 1-1 comparison
std::sort(expected_indices.begin(), expected_indices.end());
std::sort(result_indices.begin(), result_indices.end());

// Expected and actual vectors must match.
EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
EXPECT_TRUE(
std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
}
}

TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
Expand Down

0 comments on commit 4c4821a

Please sign in to comment.