diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index aed93e20823..4a52cfe098a 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -178,7 +178,7 @@ struct double_row_equality_comparator { } }; -// A CUDA Cooperative Group of 4 threads for the hash set. +// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi. auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1; // The hash set type used by mixed_semi_join with the build_table. diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index d43bf4dfca9..6b720484771 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -797,35 +797,16 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) { using T1 = double; - auto const random_data = [](size_t size) { - std::vector values(size); - using uniform_distribution = - typename std::conditional_t, - std::bernoulli_distribution, - std::conditional_t, - std::uniform_real_distribution, - std::uniform_int_distribution>>; - - static constexpr auto seed = 0xf00d; - static std::mt19937 engine{seed}; - static uniform_distribution dist{}; - std::generate(values.begin(), values.end(), [&]() { return T1{dist(engine)}; }); - - return values; - }; - - auto const random_validity = [&](size_t size) { - std::vector validity(size); - static constexpr auto seed = 0xcafe; - static std::mt19937 engine{seed}; - static std::bernoulli_distribution dist{}; - std::generate(validity.begin(), validity.end(), [&]() { return dist(engine); }); + // Column size + auto constexpr N = 1000; - return validity; - }; + // Generate column data for left and right tables + auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns(N, 200); + auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns(N, 100); + // Setup data and nulls for the left table std::vector, std::vector>> lefts = { - 
{random_data(500), random_validity(500)}, {random_data(500), random_validity(500)}}; + {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}}; std::vector> left_wrappers; std::vector left_columns; for (auto [data, valids] : lefts) { @@ -834,8 +815,9 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) left_columns.emplace_back(left_wrappers.back()); }; + // Setup data and nulls for the right table std::vector, std::vector>> rights = { - {random_data(250), random_validity(250)}, {random_data(250), random_validity(250)}}; + {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}}; std::vector> right_wrappers; std::vector right_columns; for (auto [data, valids] : rights) { @@ -848,7 +830,7 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) auto const left_table = cudf::table_view{left_columns}; auto const right_table = cudf::table_view{right_columns}; - // Use the zeroth column for equality. + // Using the zeroth column for equality. auto const left_equality = left_table.select({0}); auto const right_equality = right_table.select({0}); @@ -858,48 +840,74 @@ TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) auto left_zero_eq_right_zero = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - // Expected size of left_semi_join with only zeroth column equality. - auto const expected_num_idx_left_zero_eq_right_zero = - cudf::conditional_left_semi_join_size(left_table, right_table, left_zero_eq_right_zero); - - // Actual size of mixed_left_semi_join with only zeroth column equality. - auto const num_idx_left_zero_eq_right_zero = - cudf::mixed_left_semi_join(left_equality, - right_equality, - left_table, - right_table, - left_zero_eq_right_zero, - cudf::null_equality::UNEQUAL) - ->size(); - - // Expected and actual sizes must match. 
- EXPECT_EQ(expected_num_idx_left_zero_eq_right_zero, num_idx_left_zero_eq_right_zero); - - // Common column references for conditional column. - auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto left_one_gt_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + // Mixed semi join with zeroth column equality + { + // Expected left_semi_join result + auto const expected_mixed_semi_join = + cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero); + + // Actual mixed_left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. + EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin(), result_indices.end())); + } - auto combined_condition = cudf::ast::operation( - cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one); - - // Expected size of left_semi_join with zeroth col equality and first col conditional. 
- auto const expected_num_idx_left_one_greater_right_one = - cudf::conditional_left_semi_join_size(left_table, right_table, combined_condition); - - // Actual size of left_semi_join with zeroth col equality and first col conditional. - auto const num_idx_left_one_greater_right_one = - cudf::mixed_left_semi_join(left_equality, - right_equality, - left_table, - right_table, - left_one_gt_right_one, - cudf::null_equality::UNEQUAL) - ->size(); - - // Expected and actual sizes must match. - EXPECT_EQ(expected_num_idx_left_one_greater_right_one, num_idx_left_one_greater_right_one); + // Mixed semi join with zeroth column equality and first column GREATER conditional + { + // Column references for conditional column. + auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); + auto left_one_gt_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + // Expected left_semi_join result + auto const expected_mixed_semi_join = cudf::conditional_left_semi_join( + left_table, + right_table, + cudf::ast::operation( + cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one)); + + // Actual mixed_left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_one_gt_right_one, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected 
and actual vectors must match. + EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin(), result_indices.end())); + } } TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)