From a956cb56ac67638e47b3af368a2e78e75f9f9a1a Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 13:05:48 -0700 Subject: [PATCH 1/8] part_and_order_gens with nans --- integration_tests/src/main/python/array_test.py | 1 + integration_tests/src/main/python/window_function_test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 6cbe4382338..99b68ccfba1 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -82,6 +82,7 @@ DecimalGen(precision=12, scale=2, nullable=False), DecimalGen(precision=20, scale=2, nullable=False)] +# This non-nans version is only used for Spark version < 3.1.3 no_neg_zero_all_basic_gens_no_nans = [byte_gen, short_gen, int_gen, long_gen, # -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct FloatGen(special_cases=[], no_nans=True), diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index 5a5347c70bf..e70df9fd770 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -134,7 +134,7 @@ ('a', IntegerGen()), ('b', LongGen(nullable=True))] -part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +part_and_order_gens = [long_gen, DoubleGen(special_cases=[]), string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] From 8c4895573720412ea46d0cfb99fad74b2ae316ae Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 13:08:23 -0700 Subject: [PATCH 2/8] running_part_and_order_gens --- integration_tests/src/main/python/window_function_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index e70df9fd770..a606c87a2e8 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -138,7 +138,7 @@ string_gen, boolean_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] -running_part_and_order_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +running_part_and_order_gens = [long_gen, DoubleGen(special_cases=[]), string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] From 5d23523a8d506db73ed74503f703b73bbb427a19 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 13:11:16 -0700 Subject: [PATCH 3/8] lead_lag_data_gens --- integration_tests/src/main/python/window_function_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index a606c87a2e8..b4708b89668 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -142,7 +142,7 @@ string_gen, byte_gen, timestamp_gen, DecimalGen(precision=18, scale=1), DecimalGen(precision=38, scale=1)] -lead_lag_data_gens = [long_gen, DoubleGen(no_nans=True, special_cases=[]), +lead_lag_data_gens = [long_gen, DoubleGen(special_cases=[]), boolean_gen, timestamp_gen, string_gen, DecimalGen(precision=18, scale=3), DecimalGen(precision=38, scale=4), StructGen(children=[ From 
9e399b14ea5009e0fb68e574da3c2bfebd812f34 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 17:26:41 -0700 Subject: [PATCH 4/8] removed no_nans gens --- integration_tests/src/main/python/data_gen.py | 4 -- .../src/main/python/hash_aggregate_test.py | 63 +++++++------------ .../src/main/python/hashing_test.py | 2 +- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 9f549adfa46..2a9d7e5e6f0 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -1067,12 +1067,8 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False): # all of the basic types in a single struct all_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)]) -all_basic_struct_gen_no_nan = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens_no_nan)]) - struct_array_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens)]) -struct_array_gen_no_nans = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(single_level_array_gens_no_nan)]) - # Some struct gens, but not all because of nesting nonempty_struct_gens_sample = [all_basic_struct_gen, StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 288cf3ebc07..e4c8434dc7c 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -167,19 +167,8 @@ _grpkey_floats_with_nulls_and_nans ] -# List of schemas with no NaNs -_init_list_no_nans = [ - _longs_with_nulls, - _longs_with_no_nulls, - _grpkey_longs_with_nulls, - _grpkey_dbls_with_nulls, - _grpkey_floats_with_nulls, - _grpkey_strings_with_nulls, - _grpkey_nulls, - _grpkey_strings_with_extra_nulls] - # List of schemas with NaNs included -_init_list_with_nans_and_no_nans = [ +_init_list_with_nans = [ _longs_with_nulls, _longs_with_no_nulls, _grpkey_longs_with_nulls, @@ -197,7 +186,7 @@ ('b', DecimalGen(nullable=False)), ('c', DecimalGen(nullable=False))] -_init_list_with_nans_and_no_nans_with_decimals = _init_list_with_nans_and_no_nans + [ +_init_list_with_nans_with_decimals = _init_list_with_nans + [ _decimals_with_nulls, _decimals_with_no_nulls] # Used to test ANSI-mode fallback @@ -303,15 +292,7 @@ def get_params(init_list, marked_params=[]): ('c', _decimal_gen_sum_38_neg10)] -_init_list_no_nans_with_decimal = _init_list_no_nans + [ - _grpkey_small_decimals] - -_init_list_no_nans_with_decimalbig = _init_list_no_nans + [ - _grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals, - _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, - _grpkey_short_very_big_neg_scale_decimals] - -_init_list_with_nans_and_no_nans_with_decimalbig = _init_list_with_nans_and_no_nans + [ +_init_list_with_nans_with_decimalbig = _init_list_with_nans + [ _grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -378,7 +359,7 @@ def test_computation_in_grpby_columns(): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', 
_init_list_with_nans_with_decimalbig, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_sum(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -420,7 +401,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals, +@pytest.mark.parametrize('data_gen', _init_list_with_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_avg(data_gen, conf): @@ -451,7 +432,7 @@ def test_hash_avg_nulls_partial_only(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimalbig, ids=idfn) def test_intersectAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100))) @@ -459,7 +440,7 @@ def test_intersectAll(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimalbig, ids=idfn) def test_exceptAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) @@ -477,7 +458,7 @@ def test_exceptAll(data_gen): ('b', _pivot_gen_128bit), ('c', decimal_gen_128bit)] -_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [ +_pivot_gens_with_decimals = _init_list_with_nans + [ _grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals, _pivot_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -497,9 +478,9 @@ def test_hash_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_pivot_without_nans(data_gen, conf): +def test_hash_grpby_pivot_with_nans(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a') @@ -510,7 +491,7 @@ def test_hash_grpby_pivot_without_nans(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_grpby_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -523,9 +504,9 @@ def test_hash_multiple_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot_without_nans(data_gen, conf): +def test_hash_reduction_pivot_with_nans(data_gen, 
conf): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby() @@ -536,7 +517,7 @@ def test_hash_reduction_pivot_without_nans(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_reduction_pivot_with_nans(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -943,7 +924,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query(data_gen, conf): print_params(data_gen) @@ -965,7 +946,7 @@ def test_hash_multiple_mode_query(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @@ -978,7 +959,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -1001,7 +982,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_max_with_multiple_distincts(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -1015,7 +996,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf): conf=local_conf) @ignore_order -@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_count_with_filter(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -1027,7 +1008,7 @@ def test_hash_count_with_filter(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_filters(data_gen, conf): assert_gpu_and_cpu_are_equal_sql( @@ -1784,7 +1765,7 @@ def do_it(spark): @ignore_order(local=True) @approximate_float @incompat 
-@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimals, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_std_variance(data_gen, conf): local_conf = copy_and_update(conf, { @@ -1850,7 +1831,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled): 'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp', 'SortArray', 'Alias', 'Literal', 'Count', 'AggregateExpression', 'ProjectExec') -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) diff --git a/integration_tests/src/main/python/hashing_test.py b/integration_tests/src/main/python/hashing_test.py index 107c3a4576e..6bd56da933d 100644 --- a/integration_tests/src/main/python/hashing_test.py +++ b/integration_tests/src/main/python/hashing_test.py @@ -39,7 +39,7 @@ _xxhash_fallback_gens = single_level_array_gens + nested_array_gens_sample + [ all_basic_struct_gen, - struct_array_gen_no_nans, + struct_array_gen, _struct_of_xxhash_gens] if is_before_spark_320(): _xxhash_fallback_gens += [float_gen, double_gen] From 606b7a2d98a45669a751e529096e25ad7fd9615c Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 17:30:18 -0700 Subject: [PATCH 5/8] renamed generator from with_nans_with_decimal* to with_nan_and_decimal* --- .../src/main/python/hash_aggregate_test.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index e4c8434dc7c..f7db575ce69 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -168,7 +168,7 @@ ] # List of schemas with NaNs included -_init_list_with_nans = [ +_init_list = [ _longs_with_nulls, _longs_with_no_nulls, _grpkey_longs_with_nulls, @@ -186,7 +186,7 @@ ('b', DecimalGen(nullable=False)), ('c', DecimalGen(nullable=False))] -_init_list_with_nans_with_decimals = _init_list_with_nans + [ +_init_list_with_decimals = _init_list + [ _decimals_with_nulls, _decimals_with_no_nulls] # Used to test ANSI-mode fallback @@ -292,7 +292,7 @@ def get_params(init_list, marked_params=[]): ('c', _decimal_gen_sum_38_neg10)] -_init_list_with_nans_with_decimalbig = _init_list_with_nans + [ +_init_list_with_decimalbig = _init_list + [ _grpkey_small_decimals, _grpkey_big_decimals, _grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -359,7 +359,7 @@ def test_computation_in_grpby_columns(): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_sum(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -401,7 +401,7 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', 
_init_list_with_nans + [_grpkey_short_mid_decimals, +@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_avg(data_gen, conf): @@ -432,7 +432,7 @@ def test_hash_avg_nulls_partial_only(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_intersectAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100))) @@ -440,7 +440,7 @@ def test_intersectAll(data_gen): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_exceptAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) @@ -458,7 +458,7 @@ def test_exceptAll(data_gen): ('b', _pivot_gen_128bit), ('c', decimal_gen_128bit)] -_pivot_gens_with_decimals = _init_list_with_nans + [ +_pivot_gens_with_decimals = _init_list + [ _grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals, _pivot_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_very_big_neg_scale_decimals] @@ -478,7 +478,7 @@ def test_hash_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_pivot_with_nans(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -491,7 +491,7 @@ def test_hash_grpby_pivot_with_nans(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_grpby_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -504,7 +504,7 @@ def test_hash_multiple_grpby_pivot(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_reduction_pivot_with_nans(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -517,7 +517,7 @@ def test_hash_reduction_pivot_with_nans(data_gen, conf): @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_reduction_pivot_with_nans(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -924,7 +924,7 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() @approximate_float @ignore_order @incompat 
-@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query(data_gen, conf): print_params(data_gen) @@ -946,7 +946,7 @@ def test_hash_multiple_mode_query(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @@ -959,7 +959,7 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -982,7 +982,7 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_query_max_with_multiple_distincts(data_gen, conf): local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) @@ -996,7 +996,7 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf): conf=local_conf) @ignore_order -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_count_with_filter(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -1008,7 +1008,7 @@ def test_hash_count_with_filter(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_filters(data_gen, conf): assert_gpu_and_cpu_are_equal_sql( @@ -1765,7 +1765,7 @@ def do_it(spark): @ignore_order(local=True) @approximate_float @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_with_decimals, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_std_variance(data_gen, conf): local_conf = copy_and_update(conf, { @@ -1831,7 +1831,7 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled): 'StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp', 'SortArray', 'Alias', 'Literal', 'Count', 'AggregateExpression', 'ProjectExec') -@pytest.mark.parametrize('data_gen', _init_list_with_nans, ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) 
@pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) From 57c21d31a4aeb0406c9c7f770b8254a51eac37ae Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 28 Sep 2023 11:36:30 -0700 Subject: [PATCH 6/8] added missing gens to init_list --- integration_tests/src/main/python/hash_aggregate_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index f7db575ce69..ee836713bf3 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -175,6 +175,8 @@ _grpkey_dbls_with_nulls, _grpkey_floats_with_nulls, _grpkey_strings_with_nulls, + _grpkey_strings_with_extra_nulls, + _grpkey_nulls, _grpkey_floats_with_nulls_and_nans] # grouping decimals with nulls From 6dcd20d12563f81c9dfc3b879e13d1453b530801 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 28 Sep 2023 17:25:15 -0700 Subject: [PATCH 7/8] addressed review comments --- .../src/main/python/hash_aggregate_test.py | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index ee836713bf3..4f58278360c 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -477,19 +477,6 @@ def test_hash_grpby_pivot(data_gen, conf): .agg(f.sum('c')), conf = conf) -@approximate_float -@ignore_order(local=True) -@incompat -@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) -@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_pivot_with_nans(data_gen, conf): - assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100) - .groupby('a') - .pivot('b') - .agg(f.sum('c')), - conf=conf) - @approximate_float @ignore_order(local=True) @incompat @@ -508,20 +495,7 @@ def test_hash_multiple_grpby_pivot(data_gen, conf): @incompat @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot_with_nans(data_gen, conf): - assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100) - .groupby() - .pivot('b') - .agg(f.sum('c')), - conf=conf) - -@approximate_float -@ignore_order(local=True) -@incompat -@pytest.mark.parametrize('data_gen', _init_list, ids=idfn) -@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot_with_nans(data_gen, conf): +def test_hash_reduction_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby() From 3c2d0f411ab1e73cce78f47dd7a1e5d96e95c17d Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 28 Sep 2023 17:32:03 -0700 Subject: [PATCH 8/8] Signing off Signed-off-by: Raza Jafri