diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index 189bef329d7..06759954631 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -21,6 +21,9 @@ from marks import ignore_order, allow_non_gpu from spark_session import with_cpu_session, is_databricks113_or_later +# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +not_utc_aqe_allow=['ShuffleExchangeExec', 'HashAggregateExec'] if is_not_utc() else [] + _adaptive_conf = { "spark.sql.adaptive.enabled": "true" } def create_skew_df(spark, length): @@ -194,9 +197,8 @@ def do_it(spark): # broadcast join. The bug currently manifests in Databricks, but could # theoretically show up in other Spark distributions @ignore_order(local=True) -@allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow) +@allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow, *not_utc_aqe_allow) @pytest.mark.parametrize('join', joins, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join): data_path = spark_tmp_path + '/PARQUET_DATA' def prep(spark): diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index 97d7cb153cb..b75872ed8b2 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -16,7 +16,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql -from conftest import is_not_utc from data_gen import * from marks import ignore_order, incompat, approximate_float, allow_non_gpu, datagen_overrides from pyspark.sql.types import * @@ -984,7 +983,6 @@ def test_columnar_pow(data_gen): lambda spark : binary_op_df(spark, data_gen).selectExpr('pow(a, b)')) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_least(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -1001,7 +999,6 @@ def test_least(data_gen): f.least(*command_args))) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_greatest(data_gen): num_cols = 20 s1 = with_cpu_session( diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 29f4e64b893..ec29dce70d1 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -16,7 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect from data_gen import * -from conftest import is_databricks_runtime, is_not_utc +from conftest import is_databricks_runtime from marks import incompat from spark_session import is_before_spark_313, is_before_spark_330, is_databricks113_or_later, is_spark_330_or_later, 
is_databricks104_or_later, is_spark_33X, is_spark_340_or_later, is_spark_330, is_spark_330cdh from pyspark.sql.types import * @@ -103,13 +103,11 @@ @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) @pytest.mark.parametrize('index_gen', array_index_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item(data_gen, index_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, index_gen).selectExpr('a[b]')) @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_lit_ordinal(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -147,7 +145,6 @@ def test_array_item_with_strict_index(strict_index_enabled, index): # No need to test this for multiple data types for array. Only one is enough, but with two kinds of invalid index. @pytest.mark.parametrize('index', [-2, 100, array_neg_index_gen, array_out_index_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_ansi_fail_invalid_index(index): message = "SparkArrayIndexOutOfBoundsException" if (is_databricks104_or_later() or is_spark_330_or_later()) else "java.lang.ArrayIndexOutOfBoundsException" if isinstance(index, int): @@ -174,7 +171,6 @@ def test_array_item_ansi_not_fail_all_null_data(): decimal_gen_32bit, decimal_gen_64bit, decimal_gen_128bit, binary_gen, StructGen([['child0', StructGen([['child01', IntegerGen()]])], ['child1', string_gen], ['child2', float_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]], nullable=False)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_array(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -187,7 +183,6 @@ def test_make_array(data_gen): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orderby_array_unique(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)), @@ -217,7 +212,6 @@ def test_orderby_array_of_structs(data_gen): @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_contains(data_gen): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True)) @@ -245,7 +239,6 @@ def test_array_contains_for_nans(data_gen): @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_array_element_at(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, array_no_zero_index_gen).selectExpr( @@ -310,7 +303,6 @@ def test_array_element_at_zero_index_fail(index, ansi_enabled): @pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_transform(data_gen): def do_it(spark): columns = ['a', 'b', @@ -345,7 +337,6 @@ def do_it(spark): string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + decimal_gens @pytest.mark.parametrize('data_gen', array_min_max_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, ArrayGen(data_gen)).selectExpr( @@ -370,7 +361,6 @@ def test_array_concat_decimal(data_gen): 'concat(a, a)'))) @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_column(data_gen): cnt_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[]) cnt_not_null_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[], nullable=False) @@ -384,7 +374,6 @@ def test_array_repeat_with_count_column(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_scalar(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -414,7 +403,6 @@ def test_sql_array_scalars(query): @pytest.mark.parametrize('data_gen', all_basic_gens + nested_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_array_struct_fields(data_gen): array_struct_gen = ArrayGen( StructGen([['child0', data_gen], ['child1', int_gen]]), @@ -453,7 +441,6 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', array_zips_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_zip(data_gen): gen = StructGen( [('a', data_gen), ('b', data_gen), ('c', data_gen), ('d', data_gen)], nullable=False) @@ -486,7 +473,6 @@ def q1(spark): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') @pytest.mark.skipif(is_before_spark_313() or is_spark_330() or is_spark_330cdh(), reason="NaN equality is only handled in Spark 3.1.3+ and SPARK-39976 issue with null and ArrayIntersect in Spark 3.3.0") def test_array_intersect(data_gen): gen = StructGen( @@ -528,7 +514,6 @@ def test_array_intersect_spark330(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only 
handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_intersect_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -549,7 +534,6 @@ def test_array_intersect_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -570,7 +554,6 @@ def test_array_union(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -591,7 +574,6 @@ def test_array_union_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -612,7 +594,6 @@ def test_array_except(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -633,7 +614,6 @@ def test_array_except_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -655,7 +635,6 @@ def test_arrays_overlap(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -693,7 +672,6 @@ def test_array_remove_scalar(data_gen): FloatGen(special_cases=_non_neg_zero_float_special_cases + [-0.0]), 
DoubleGen(special_cases=_non_neg_zero_double_special_cases + [-0.0]), StringGen(pattern='[0-9]{1,5}'), boolean_gen, date_gen, timestamp_gen] + decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_remove(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -708,7 +686,6 @@ def test_array_remove(data_gen): @pytest.mark.parametrize('data_gen', [ArrayGen(sub_gen) for sub_gen in array_gens_sample], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_flatten_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('flatten(a)') diff --git a/integration_tests/src/main/python/ast_test.py b/integration_tests/src/main/python/ast_test.py index 94976ea2208..3d9cd3cd794 100644 --- a/integration_tests/src/main/python/ast_test.py +++ b/integration_tests/src/main/python/ast_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture -from conftest import is_not_utc from data_gen import * from marks import approximate_float, datagen_overrides from spark_session import with_cpu_session, is_before_spark_330 @@ -71,7 +70,6 @@ def assert_binary_ast(data_descr, func, conf={}): assert_gpu_ast(is_supported, lambda spark: func(binary_op_df(spark, data_gen)), conf=conf) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -81,7 +79,6 @@ def test_literal(spark_tmp_path, data_gen): func=lambda spark: spark.read.parquet(data_path).select(scalar)) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_null_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. 
data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -235,7 +232,6 @@ def test_expm1(data_descr): assert_unary_ast(data_descr, lambda df: df.selectExpr('expm1(a)')) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -245,7 +241,6 @@ def test_eq(data_descr): f.col('a') == f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -255,7 +250,6 @@ def test_ne(data_descr): f.col('a') != f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -265,7 +259,6 @@ def test_lt(data_descr): f.col('a') < f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -275,7 +268,6 @@ def test_lte(data_descr): f.col('a') <= f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -285,7 +277,6 @@ def test_gt(data_descr): f.col('a') > f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, diff --git a/integration_tests/src/main/python/cache_test.py b/integration_tests/src/main/python/cache_test.py index e028e93a959..70fb95fc1ea 100644 --- a/integration_tests/src/main/python/cache_test.py +++ b/integration_tests/src/main/python/cache_test.py @@ -65,7 +65,6 @@ def test_passing_gpuExpr_as_Expr(enable_vectorized_conf): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_join(data_gen, enable_vectorized_conf): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -93,7 +92,6 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', 
enable_vectorized_confs, ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_expand_exec(data_gen, enable_vectorized_conf): def op_df(spark, length=2048): cached = gen_df(spark, StructGen([ @@ -153,6 +151,9 @@ def n_fold(spark): with_cpu_session(n_fold) +# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +non_utc_orc_save_table_allow = ['DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else [] + # This test doesn't allow negative scale for Decimals as ` df.write.mode('overwrite').parquet(data_path)` # writes parquet which doesn't allow negative decimals @pytest.mark.parametrize('data_gen', [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(), @@ -167,8 +168,7 @@ def n_fold(spark): @pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']) @pytest.mark.parametrize('enable_vectorized', ['true', 'false'], ids=idfn) @ignore_order -@allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning", *non_utc_orc_save_table_allow) def test_cache_columnar(spark_tmp_path, data_gen, enable_vectorized, ts_write): data_path_gpu = spark_tmp_path + '/PARQUET_DATA' def read_parquet_cached(data_path): @@ -281,7 +281,6 @@ def helper(spark): @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @pytest.mark.parametrize('batch_size', [{"spark.rapids.sql.batchSizeBytes": "100"}, {}], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_count(data_gen, with_x_session, enable_vectorized_conf, batch_size): test_conf = copy_and_update(enable_vectorized_conf, batch_size) generate_data_and_test_func_on_cached_df(with_x_session, lambda df: df.count(), data_gen, test_conf) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index beb3111383b..b81d6a2e050 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -61,11 +61,13 @@ def test_cast_nested(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type))) +date_after_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]' + def test_cast_string_date_valid_format(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, StringGen('[0-9]{0,3}[1-9]-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())), + lambda spark : unary_op_df(spark, StringGen(date_after_1_2_1)).select(f.col('a').cast(DateType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim @@ -146,11 +148,12 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) -@pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), - StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), - StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')], +@pytest.mark.parametrize('data_gen', [StringGen(date_after_1_2_1), + StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), + StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?') + ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_string_ts_valid_format(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. @@ -298,7 +301,7 @@ def _assert_cast_to_string_equal (data_gen, conf): @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -318,7 +321,7 @@ def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -338,7 +341,7 @@ def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -403,7 +406,7 @@ def test_cast_string_to_negative_scale_decimal(): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float("inf"), float("-inf"), float("nan")]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_ansi_for_nan_inf(type, invalid_value): def fun(spark): data = [invalid_value] @@ -415,7 +418,7 @@ def fun(spark): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float(LONG_MAX) + 100, float(LONG_MIN) - 100]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_ansi_overflow(type, invalid_value): def fun(spark): data = [invalid_value] @@ -424,7 +427,7 @@ def fun(spark): assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "ArithmeticException") @pytest.mark.skipif(is_before_spark_330(), reason='330+ throws exception in ANSI mode') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_side_effect(): def getDf(spark): data = [(True, float(LONG_MAX) + 100), (False, float(1))] @@ -436,7 +439,7 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp_for_nan_inf(type): def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] @@ -456,7 +459,7 @@ def fun(spark): short_gen, int_gen, long_gen_to_timestamp], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_integral_to_timestamp(gen, ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -465,7 +468,7 @@ def test_cast_integral_to_timestamp(gen, ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_float_to_timestamp(ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -475,7 +478,7 @@ def test_cast_float_to_timestamp(ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_double_to_timestamp(ansi_enabled): if (is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -493,7 +496,7 @@ def test_cast_double_to_timestamp(ansi_enabled): (INT_MIN - 1, IntegerType()), ], ids=idfn) @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): (invalid, to_type) = invalid_and_type assert_gpu_and_cpu_error( @@ -504,7 +507,7 @@ def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): error_message="overflow") @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_numeric_ansi_no_overflow(): data = [datetime.fromtimestamp(i) for i in range(BYTE_MIN, BYTE_MAX + 1)] assert_gpu_and_cpu_are_equal_collect( @@ -513,14 +516,14 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_numeric_non_ansi(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_string(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..59759098f28 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -19,11 +19,10 @@ from data_gen import * from spark_session import with_cpu_session, is_before_spark_330 from pyspark.sql.types import * -from marks import datagen_overrides +from marks import datagen_overrides, allow_non_gpu import pyspark.sql.functions as f @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -57,7 +56,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq_ns(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -85,7 +83,6 @@ def test_eq_ns_for_interval(): f.col('a').eqNullSafe(f.col('b')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, 
force_no_nulls=not isinstance(data_gen, NullGen))) @@ -119,7 +116,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -153,7 +149,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -188,7 +183,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -223,7 +217,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -258,7 +251,6 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_isnull(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( @@ -278,37 +270,35 @@ def test_isnan(data_gen): f.isnan(f.col('a')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_any(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna()) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_all(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna(how='all')) #dropna is really a filter along with a test for null, but lets do an explicit filter test too @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 
'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, BooleanGen(), data_gen, data_gen).filter(f.col('a'))) # coalesce batch happens after a filter, but only if something else happens on the GPU after that @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter_with_project(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, BooleanGen(), data_gen).filter(f.col('a')).selectExpr('*', 'a as a2')) +# DateAddInterval is a time zone aware expression +non_utc_allow_for_date_add_interval = ['ProjectExec', 'FilterExec'] if is_not_utc() else [] # It takes quite a bit to get filter to have a column it can filter on, but # no columns to actually filter. We are making it happen here with a sub-query # and some constants that then make it so all we need is the number of rows # of input. @pytest.mark.parametrize('op', ['>', '<']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_date_add_interval) def test_empty_filter(op, spark_tmp_path): def do_it(spark): @@ -337,7 +327,6 @@ def test_filter_with_lit(expr): # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries under that value. @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_in(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) - 1 @@ -350,7 +339,7 @@ def test_in(data_gen): # This is to test entries over that value. 
@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index ea9eb4538df..16a450a1a8c 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error -from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from string_test import mk_str_gen @@ -23,6 +22,7 @@ import pyspark.sql.utils from spark_session import with_cpu_session, with_gpu_session from conftest import get_datagen_seed +from marks import allow_non_gpu nested_gens = [ArrayGen(LongGen()), ArrayGen(decimal_gen_128bit), StructGen([("a", LongGen()), ("b", decimal_gen_128bit)]), @@ -35,7 +35,6 @@ for sub_gen in all_gen + [null_gen]] @pytest.mark.parametrize('data_gen', non_nested_array_gens + nested_array_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen).selectExpr( @@ -46,7 +45,6 @@ def test_concat_list(data_gen): ) @pytest.mark.parametrize('dg', non_nested_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_double_list_with_lit(dg): data_gen = ArrayGen(dg, max_length=2) array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) @@ -70,7 +68,6 @@ def test_concat_double_list_with_lit(dg): @pytest.mark.parametrize('data_gen', non_nested_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -99,7 +96,6 @@ def test_concat_string(): f.concat(f.col('a'), f.lit('')))) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_concat(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen @@ -111,7 +107,6 @@ def test_map_concat(data_gen): ) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_map_concat_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -125,7 +120,6 @@ def test_map_concat_with_lit(data_gen): @pytest.mark.parametrize('data_gen', all_gen + nested_gens, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_array(data_gen, size_of_null): gen = ArrayGen(data_gen) assert_gpu_and_cpu_are_equal_collect( @@ -134,14 +128,12 @@ def test_size_of_array(data_gen, size_of_null): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_map(data_gen, size_of_null): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('size(a)'), conf={'spark.sql.legacy.sizeOfNull': size_of_null}) @pytest.mark.parametrize('data_gen', array_gens_sample + [string_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reverse(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('reverse(a)')) @@ -152,7 +144,6 @@ def test_reverse(data_gen): ] @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select( @@ -160,7 +151,6 @@ def test_sort_array(data_gen): f.sort_array(f.col('a'), False))) @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array_lit(data_gen): array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) assert_gpu_and_cpu_are_equal_collect( @@ -261,7 +251,7 @@ def test_sort_array_normalize_nans(): gens in sequence_normal_integral_gens] @pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_without_step(start_gen, stop_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, start_gen, stop_gen).selectExpr( @@ -270,7 +260,7 @@ def test_sequence_without_step(start_gen, stop_gen): "sequence(20, b)")) @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_with_step(start_gen, stop_gen, step_gen): # Get the datagen seed we use for all datagens, since we need to call start # on step_gen @@ -319,7 +309,7 @@ def test_sequence_with_step(start_gen, 
stop_gen, step_gen): ] @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): assert_gpu_and_cpu_error( lambda spark:three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( @@ -334,7 +324,7 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_too_long_sequence(stop_gen): assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. @@ -376,7 +366,7 @@ def get_sequence_data(gen, len): mixed_schema) # test for 3 cases mixed in a single dataset -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sequence_with_step_mixed_cases(): assert_gpu_and_cpu_are_equal_collect( lambda spark: get_sequence_cases_mixed_df(spark) diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index 48d5a05c099..be0eed865f9 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -15,11 +15,10 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_320, is_jvm_charset_utf8 from pyspark.sql.types import * -from marks import datagen_overrides +from marks import datagen_overrides, allow_non_gpu import pyspark.sql.functions as f def mk_str_gen(pattern): @@ -45,7 +44,6 @@ def mk_str_gen(pattern): if_nested_gens = if_array_gens_sample + if_struct_gens_sample @pytest.mark.parametrize('data_gen', all_gens + if_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -65,17 +63,14 @@ def test_if_else(data_gen): # Maps scalars are not really supported by Spark from python without jumping through a lot of hoops # so for now we are going to skip them @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else_map(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, boolean_gen, data_gen, data_gen).selectExpr( 'IF(TRUE, b, c)', 'IF(a, b, c)')) -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9685') @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_case_when(data_gen): num_cmps = 20 s1 = with_cpu_session( @@ -119,7 +114,6 @@ def test_nanvl(data_gen): f.nanvl(f.lit(float('nan')).cast(data_type), f.col('b')))) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -137,9 +131,7 @@ def test_nvl(data_gen): # in both cpu and gpu runs. # E: java.lang.AssertionError: assertion failed: each serializer expression should contain\ # at least one `BoundReference` -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9684') @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens_nonempty_struct + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -161,7 +153,6 @@ def test_coalesce_constant_output(): lambda spark : spark.range(1, 100).selectExpr("4 + coalesce(5, id) as nine")) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl2(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -175,7 +166,6 @@ def test_nvl2(data_gen): 'nvl2(a, {}, c)'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nullif(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -189,7 +179,6 @@ def test_nullif(data_gen): 'nullif(a, {})'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ifnull(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -241,7 +230,7 @@ def test_conditional_with_side_effects_case_when(data_gen): conf = test_conf) @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_conditional_with_side_effects_sequence(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -252,7 +241,7 @@ def test_conditional_with_side_effects_sequence(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_conditional_with_side_effects_sequence_cast(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index c10221a4407..3fb83a665e3 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -248,7 +248,7 @@ def read_impl(spark): @pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_basic_csv_read(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): updated_conf=copy_and_update(_enable_all_types_conf, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -289,7 +289,7 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, @approximate_float @pytest.mark.parametrize('data_gen', csv_supported_gens, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): gen = StructGen([('a', data_gen)], nullable=False) data_path = spark_tmp_path + '/CSV_DATA' @@ -331,8 +331,9 @@ def test_csv_fallback(spark_tmp_path, read_func, disable_conf, spark_tmp_table_f @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) @pytest.mark.parametrize('time_parser_policy', [ pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('BatchScanExec,FileSourceScanExec')), - 'CORRECTED', - 'EXCEPTION' + # Date is also time zone related for csv since rebase. + pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_allow)), + pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*non_utc_allow)) ]) def test_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list, ansi_enabled, time_parser_policy): gen = StructGen([('a', DateGen())], nullable=False) @@ -365,13 +366,16 @@ def test_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list, a .csv(data_path), conf=updated_conf) +non_utc_allow_for_test_read_valid_and_invalid_dates=['BatchScanExec', 'FileSourceScanExec'] if is_not_utc() else [] + @pytest.mark.parametrize('filename', ["date.csv"]) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) @pytest.mark.parametrize('time_parser_policy', [ pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('BatchScanExec,FileSourceScanExec')), - 'CORRECTED', - 'EXCEPTION' + # Date is also time zone related for csv since rebasing. 
+ pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_allow_for_test_read_valid_and_invalid_dates)), + pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*non_utc_allow_for_test_read_valid_and_invalid_dates)) ]) def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, ansi_enabled, time_parser_policy): data_path = std_input_path + '/' + filename @@ -406,7 +410,7 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -476,8 +480,7 @@ def test_input_meta_fallback(spark_tmp_path, v1_enabled_list, disable_conf): cpu_fallback_class_name = 'FileSourceScanExec' if v1_enabled_list == 'csv' else 'BatchScanExec', conf=updated_conf) -@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec', *non_utc_allow) def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/CSV_DATA' @@ -570,7 +573,6 @@ def test_csv_read_count(spark_tmp_path): @pytest.mark.parametrize("timestamp_type", [ pytest.param('TIMESTAMP_LTZ', marks=pytest.mark.xfail(is_spark_350_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/9325")), "TIMESTAMP_NTZ"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') @@ -615,15 +617,21 @@ def do_read(spark): cpu_fallback_class_name = cpu_scan_class, conf = conf) else: - assert_cpu_and_gpu_are_equal_collect_with_capture( - lambda spark: do_read(spark), - exist_classes = 'Gpu' + cpu_scan_class, - non_exist_classes = cpu_scan_class, - conf = conf) + if is_not_utc(): + # non UTC is not supported for csv, skip capture check + # tracked in https://github.com/NVIDIA/spark-rapids/issues/9913 + assert_gpu_and_cpu_are_equal_collect(lambda spark: do_read(spark), conf = conf) + else: + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark: do_read(spark), + exist_classes = 'Gpu' + cpu_scan_class, + non_exist_classes = cpu_scan_class, + conf = conf) + + @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_prefer_date_with_infer_schema(spark_tmp_path): # start date "0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606 data_gens = [byte_gen, short_gen, int_gen, long_gen, 
boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))] @@ -632,14 +640,21 @@ def test_csv_prefer_date_with_infer_schema(spark_tmp_path): with_cpu_session(lambda spark: gen_df(spark, gen_list).write.csv(data_path)) + if is_not_utc(): # non UTC is not supported for csv + exist_clazz = 'FileSourceScanExec' + non_exist_clazz = 'GpuFileSourceScanExec' + else: + exist_clazz = 'GpuFileSourceScanExec' + non_exist_clazz = 'FileSourceScanExec' + assert_cpu_and_gpu_are_equal_collect_with_capture( lambda spark: spark.read.option("inferSchema", "true").csv(data_path), - exist_classes = 'GpuFileSourceScanExec', - non_exist_classes = 'FileSourceScanExec') + exist_classes = exist_clazz, + non_exist_classes = non_exist_clazz) assert_cpu_and_gpu_are_equal_collect_with_capture( lambda spark: spark.read.option("inferSchema", "true").option("preferDate", "false").csv(data_path), - exist_classes = 'GpuFileSourceScanExec', - non_exist_classes = 'FileSourceScanExec') + exist_classes = exist_clazz, + non_exist_classes = non_exist_clazz) @allow_non_gpu('FileSourceScanExec') @pytest.mark.skipif(is_before_spark_340(), reason='enableDateTimeParsingFallback is supported from Spark3.4.0') diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 9aa5e547c45..2232f80d197 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -24,7 +24,7 @@ from spark_session import is_before_spark_340, with_cpu_session import sre_yield import struct -from conftest import skip_unless_precommit_tests,get_datagen_seed +from conftest import skip_unless_precommit_tests,get_datagen_seed, is_not_utc import time import os from functools import lru_cache @@ -676,7 +676,29 @@ def start(self, rand): def make_dict(): length = rand.randint(self._min_length, self._max_length) return {self._key_gen.gen(): self._value_gen.gen() for idx in range(0, length)} - self._start(rand, make_dict) + def make_dict_float(): + # In a Spark map, at most one key can be NaN. However, in a Python dict, multiple NaN keys + # are allowed because NaN != NaN. So we need to ensure that there is at most one NaN + # key in the dict when generating map type data.
+ length = rand.randint(self._min_length, self._max_length) + count = 0 + has_nan = False + result = {} + while count < length: + key = self._key_gen.gen() + if math.isnan(key): + if has_nan: + continue + else: + has_nan = True + result[key] = self._value_gen.gen() + count += 1 + return result + + if self._key_gen.data_type == FloatType() or self._key_gen.data_type == DoubleType(): + self._start(rand, make_dict_float) + else: + self._start(rand, make_dict) def contains_ts(self): return self._key_gen.contains_ts() or self._value_gen.contains_ts() @@ -1172,3 +1194,10 @@ def get_25_partitions_df(spark): StructField("c3", IntegerType())]) data = [[i, j, k] for i in range(0, 5) for j in range(0, 5) for k in range(0, 100)] return spark.createDataFrame(data, schema) + + +# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +# This will be deprecated and replaced by case-specific non GPU allow lists +non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec', + 'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec', 'ShuffleExchangeExec', + 'ExecutedCommandExec'] if is_not_utc() else [] \ No newline at end of file diff --git a/integration_tests/src/main/python/datasourcev2_read_test.py b/integration_tests/src/main/python/datasourcev2_read_test.py index cc141700cb8..a3414f5c32a 100644 --- a/integration_tests/src/main/python/datasourcev2_read_test.py +++ b/integration_tests/src/main/python/datasourcev2_read_test.py @@ -15,7 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal -from conftest import is_not_utc +from data_gen import non_utc_allow from marks import * columnarClass = 'com.nvidia.spark.rapids.tests.datasourcev2.parquet.ArrowColumnarDataSourceV2' @@ -27,31 +27,27 @@ def readTable(types, classToUse): .orderBy("col1") @validate_execs_in_gpu_plan('HostColumnarToGpu') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_int(): assert_gpu_and_cpu_are_equal_collect(readTable("int", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_read_strings(): assert_gpu_and_cpu_are_equal_collect(readTable("string", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types_count(): assert_gpu_and_cpu_row_counts_equal( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_arrow_off(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index d68dd93efac..4c08d7d3935 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -26,7 +26,7 @@ vals = [(-584, 1563), (1943, 1101), (2693, 2167), (2729, 0), (44, 1534), (2635, 3319), (1885, -2828), (0, 2463), (932, 2286), (0, 0)] @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timesub(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -35,7 +35,7 @@ def test_timesub(data_gen): .selectExpr("a - (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timeadd(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -45,7 +45,7 @@ def test_timeadd(data_gen): .selectExpr("a + (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timeadd_daytime_column(): gen_list = [ # timestamp column max year is 1000 @@ -63,7 +63,7 @@ def test_interval_seconds_overflow_exception(): error_message="IllegalArgumentException") @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timeadd_from_subquery(data_gen): def fun(spark): @@ -75,7 +75,7 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timesub_from_subquery(data_gen): def fun(spark): @@ -91,7 +91,7 @@ def fun(spark): # [SPARK-34896][SQL] Return day-time interval from dates subtraction # 1. Add the SQL config `spark.sql.legacy.interval.enabled` which will control when Spark SQL should use `CalendarIntervalType` instead of ANSI intervals. @pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateaddinterval(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -102,7 +102,7 @@ def test_dateaddinterval(data_gen): # test add days(not specify hours, minutes, seconds, milliseconds, microseconds) in ANSI mode. 
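The decorator swaps in this file all lean on the same mechanism: non_utc_allow (defined in data_gen.py above) is an empty list when the session time zone is UTC, so @allow_non_gpu(*non_utc_allow) adds no allowances and the test still demands full GPU execution; only under a non-UTC time zone may the listed execs fall back to the CPU without failing the test. Below is a minimal sketch of that star-unpacking behaviour, using stand-in names rather than the real allow_non_gpu marker from marks.py or the conftest is_not_utc helper.

# Minimal sketch only: stand-in decorator and helper, not the real plugin test code.
# It shows why unpacking an empty conditional list makes the decorator a no-op when
# the test session runs in UTC.
import os

def is_not_utc_example():
    # Stand-in: the real helper inspects the configured Spark session time zone.
    return os.environ.get('TZ', 'UTC') != 'UTC'

non_utc_allow_example = ['ProjectExec', 'FilterExec'] if is_not_utc_example() else []

def allow_non_gpu_example(*execs):
    # Record which execs the test tolerates on the CPU, much as the real pytest
    # marker does through test metadata.
    def wrap(test_fn):
        test_fn.allowed_cpu_execs = set(execs)
        return test_fn
    return wrap

@allow_non_gpu_example(*non_utc_allow_example)
def test_timeadd_example():
    pass

# Empty set under TZ=UTC (so any CPU fallback is an error); the listed execs are
# tolerated when, e.g., TZ=Asia/Shanghai.
print(test_timeadd_example.allowed_cpu_execs)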
@pytest.mark.parametrize('data_gen', vals, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateaddinterval_ansi(data_gen): days, _ = data_gen # only specify the `days` @@ -130,17 +130,17 @@ def test_datediff(data_gen): 'datediff(a, date(null))', 'datediff(a, \'2016-03-02\')')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hour(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_minute(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_second(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)')) @@ -199,7 +199,7 @@ def test_datesub(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(min_val=-106032829, max_val=103819094, special_cases=[-106032829, 103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_dateadd_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -213,7 +213,7 @@ def test_dateadd_with_date_overflow(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(max_val=106032829, min_val=-103819094, special_cases=[106032829, -103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_datesub_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -245,7 +245,7 @@ def test_dayofyear(data_gen): lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a')))) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a')))) @@ -262,7 +262,7 @@ def test_unsupported_fallback_unix_timestamp(data_gen): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_to_unix_timestamp(data_gen, 
ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), @@ -280,7 +280,7 @@ def test_unsupported_fallback_to_unix_timestamp(data_gen): @pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_utc_timestamp(data_gen, time_zone): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone))) @@ -305,7 +305,7 @@ def test_from_utc_timestamp_unsupported_timezone_fallback(data_gen, time_zone): @pytest.mark.parametrize('time_zone', ["UTC", "Asia/Shanghai", "EST", "MST", "VST"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_utc_timestamp_supported_timezones(data_gen, time_zone): # TODO: Remove spark.rapids.sql.nonUTC.enabled configuration assert_gpu_and_cpu_are_equal_collect( @@ -363,7 +363,7 @@ def fun(spark): @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) # first get expected string via `date_format` -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_to_timestamp_functions_ansi_valid(parser_policy): expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', @@ -381,7 +381,7 @@ def fun(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", "spark.sql.legacy.timeParserPolicy": "CORRECTED"} @@ -391,7 +391,7 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), @@ -399,7 +399,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_to_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": 
"true"} assert_gpu_and_cpu_are_equal_collect( @@ -418,7 +418,7 @@ def invalid_date_string_df(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_to_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), @@ -432,7 +432,7 @@ def test_string_to_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_string_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), @@ -446,7 +446,7 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_gettimestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), @@ -454,7 +454,7 @@ def test_gettimestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_gettimestamp_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) @@ -469,7 +469,7 @@ def test_gettimestamp_ansi_exception(): 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM'] @pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_date_format(data_gen, date_format): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) @@ -504,7 +504,7 @@ def test_date_format_maybe(data_gen, date_format): @pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_date_format_maybe_incompat(data_gen, date_format): conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} 
assert_gpu_and_cpu_are_equal_collect( @@ -514,9 +514,8 @@ def test_date_format_maybe_incompat(data_gen, date_format): # where we had a failure due to GpuCast canonicalization with timezone. # In this case it was doing filter after project, the way I get that to happen is by adding in the # input_file_name(), otherwise filter happens before project. -@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec') +@allow_non_gpu("CollectLimitExec", "FileSourceScanExec" ,"DeserializeToObjectExec", *non_utc_allow) @ignore_order() -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format_mmyyyy_cast_canonicalization(spark_tmp_path): data_path = spark_tmp_path + '/CSV_DATA' gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False) @@ -562,12 +561,12 @@ def test_unsupported_fallback_to_date(): seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(), DoubleGen(min_exp=0, max_exp=32), ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)] @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_seconds(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_seconds_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -575,7 +574,7 @@ def test_timestamp_seconds_long_overflow(): error_message='long overflow') @pytest.mark.parametrize('data_gen', [DecimalGen(7, 7), DecimalGen(20, 7)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_seconds_rounding_necessary(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -583,7 +582,7 @@ def test_timestamp_seconds_rounding_necessary(data_gen): error_message='Rounding necessary') @pytest.mark.parametrize('data_gen', [DecimalGen(19, 6), DecimalGen(20, 6)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_seconds_decimal_overflow(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -592,12 +591,12 @@ def test_timestamp_seconds_decimal_overflow(data_gen): millis_gens = [LongGen(min_val=-62135510400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_millis(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, 
data_gen).selectExpr("timestamp_millis(a)")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_millis_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_millis(a)").collect(), @@ -606,7 +605,7 @@ def test_timestamp_millis_long_overflow(): micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) diff --git a/integration_tests/src/main/python/expand_exec_test.py b/integration_tests/src/main/python/expand_exec_test.py index abb9a7bd094..d53000e9849 100644 --- a/integration_tests/src/main/python/expand_exec_test.py +++ b/integration_tests/src/main/python/expand_exec_test.py @@ -14,7 +14,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_equal -from conftest import is_not_utc from data_gen import * import pyspark.sql.functions as f from marks import ignore_order @@ -23,7 +22,6 @@ # Many Spark versions have issues sorting large decimals, # see https://issues.apache.org/jira/browse/SPARK-40089. @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_expand_exec(data_gen): def op_df(spark, length=2048): return gen_df(spark, StructGen([ diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index 1837f31aa95..84dbd4c5473 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -50,7 +50,7 @@ def do_join_explain(spark): with_cpu_session(do_join_explain) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.skipif(is_not_utc(), reason='Cast is not supported with timezone setting. https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explain_set_config(): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index a12bd223778..66f069edeff 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from fastparquet_utils import get_fastparquet_result_canonicalizer from spark_session import is_databricks_runtime, spark_version, with_cpu_session, with_gpu_session @@ -124,9 +123,12 @@ def read_with_fastparquet_or_plugin(spark): marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")), pytest.param(DateGen(nullable=False), marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. 
year=8705) incorrectly.")), - TimestampGen(nullable=False, - start=pandas_min_datetime, - end=pandas_max_datetime), # Vanilla case. + pytest.param(TimestampGen(nullable=False, + start=pandas_min_datetime, + end=pandas_max_datetime), + marks=pytest.mark.skipif(condition=is_not_utc(), + reason="fastparquet interprets timestamps in UTC timezone, regardless " + "of timezone settings")), # Vanilla case. pytest.param(TimestampGen(nullable=False, start=pandas_min_datetime, end=pandas_max_datetime), @@ -142,7 +144,6 @@ def read_with_fastparquet_or_plugin(spark): marks=pytest.mark.xfail(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/9778")), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): """ This test writes data_gen output to Parquet via Apache Spark, then verifies that fastparquet and the RAPIDS @@ -201,15 +202,17 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")), pytest.param(DateGen(nullable=False), marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. year=8705) incorrectly.")), - TimestampGen(nullable=False, - start=pandas_min_datetime, - end=pandas_max_datetime), # Vanilla case. + pytest.param(TimestampGen(nullable=False, + start=pandas_min_datetime, + end=pandas_max_datetime), + marks=pytest.mark.skipif(condition=is_not_utc(), + reason="fastparquet interprets timestamps in UTC timezone, regardless " + "of timezone settings")), # Vanilla case. pytest.param(TimestampGen(nullable=False, start=datetime(1, 2, 1, tzinfo=timezone.utc), end=pandas_min_datetime), marks=pytest.mark.xfail(reason="fastparquet reads timestamps preceding 1900 incorrectly.")), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_with_gpu(spark_tmp_path, column_gen): """ This test writes the data-gen output to file via the RAPIDS plugin, then checks that the data is read identically @@ -392,7 +395,6 @@ def write_with_fastparquet(spark, data_gen): marks=pytest.mark.xfail(reason="fastparquet fails to read nullable Struct columns written from Apache Spark. 
" "It fails the rewrite to parquet, thereby failing the test.")), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_rewritten_with_fastparquet(column_gen, time_format, spark_tmp_path): """ This test is a workaround to test data-types that have problems being converted diff --git a/integration_tests/src/main/python/generate_expr_test.py b/integration_tests/src/main/python/generate_expr_test.py index cde16352236..66c56ebcd19 100644 --- a/integration_tests/src/main/python/generate_expr_test.py +++ b/integration_tests/src/main/python/generate_expr_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect -from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from pyspark.sql.types import * @@ -38,7 +37,6 @@ def four_op_df(spark, gen, length=2048): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('a', 'explode(array(b, c, d))')) @@ -47,7 +45,6 @@ def test_explode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -63,7 +60,6 @@ def test_explode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + map_gens_sample + arrays_with_binary + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -74,7 +70,6 @@ def test_explode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -85,7 +80,6 @@ def test_explode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -100,7 +94,6 @@ def test_explode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', 
explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -111,7 +104,6 @@ def test_explode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -122,7 +114,7 @@ def test_explode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_explode_outer_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -134,7 +126,7 @@ def test_explode_outer_nested_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a')) @@ -143,7 +135,7 @@ def test_posexplode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -158,7 +150,7 @@ def test_posexplode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -169,7 +161,7 @@ def test_posexplode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def 
test_posexplode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -180,7 +172,7 @@ def test_posexplode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -195,7 +187,7 @@ def test_posexplode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -206,7 +198,7 @@ def test_posexplode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -217,7 +209,7 @@ def test_posexplode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_posexplode_nested_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -242,7 +234,7 @@ def test_stack(): # gpu stack not guarantee to produce the same output order as Spark does @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_stack_mixed_types(): base_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen, null_gen, DecimalGen(precision=7, scale=3), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 0c99fc4516a..c8d304c4c93 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -334,7 +334,6 @@ def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, overri @allow_non_gpu("SortAggregateExec", "SortExec", "ShuffleExchangeExec") @ignore_order @pytest.mark.parametrize('data_gen', _grpkey_nested_structs_with_array_basic_child + _grpkey_list_with_non_nested_children, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_hash_grpby_list_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).coalesce(1).groupby('a').agg(f.min('b'), f.max('b')) @@ -617,7 +616,6 @@ def test_decimal128_min_max_group_by(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_min_max_group_by(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) @@ -629,11 +627,10 @@ def test_min_max_group_by(data_gen): # support sorting certain nested/arbitrary types on the GPU # See https://github.com/NVIDIA/spark-rapids/issues/3715 # and https://github.com/rapidsai/cudf/issues/11222 -@allow_non_gpu("ProjectExec", "SortArray") +@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_list_op, ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg): def doit(spark): df = gen_df(spark, data_gen, length=100)\ @@ -665,7 +662,7 @@ def doit(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_groupby_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -674,7 +671,7 @@ def test_hash_groupby_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_groupby_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -687,9 +684,8 @@ def test_hash_groupby_collect_set_on_nested_type(data_gen): # See https://github.com/NVIDIA/spark-rapids/issues/3715 # and https://github.com/rapidsai/cudf/issues/11222 @ignore_order(local=True) -@allow_non_gpu("ProjectExec", "SortArray") +@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -711,7 +707,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_reduction_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -719,7 
+715,7 @@ def test_hash_reduction_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_reduction_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -731,9 +727,8 @@ def test_hash_reduction_collect_set_on_nested_type(data_gen): # See https://github.com/NVIDIA/spark-rapids/issues/3715 # and https://github.com/rapidsai/cudf/issues/11222 @ignore_order(local=True) -@allow_non_gpu("ProjectExec", "SortArray") +@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -753,7 +748,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_groupby_collect_with_single_distinct(data_gen): # test collect_ops with other distinct aggregations assert_gpu_and_cpu_are_equal_collect( @@ -766,7 +761,7 @@ def test_hash_groupby_collect_with_single_distinct(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_groupby_single_distinct_collect(data_gen): # test distinct collect sql = """select a, @@ -790,7 +785,7 @@ def test_hash_groupby_single_distinct_collect(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_groupby_collect_with_multi_distinct(data_gen): def spark_fn(spark_session): return gen_df(spark_session, data_gen, length=100).groupby('a').agg( @@ -812,12 +807,11 @@ def spark_fn(spark_session): @allow_non_gpu('ObjectHashAggregateExec', 'SortAggregateExec', 'ShuffleExchangeExec', 'HashPartitioning', 'SortExec', 'SortArray', 'Alias', 'Literal', 'Count', 'CollectList', 'CollectSet', - 'AggregateExpression', 'ProjectExec') + 'AggregateExpression', 'ProjectExec', *non_utc_allow) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_fallback(data_gen, replace_mode, aqe_enabled, @@ -859,13 
+853,12 @@ def test_hash_groupby_collect_partial_replace_fallback(data_gen, @allow_non_gpu('ObjectHashAggregateExec', 'SortAggregateExec', 'ShuffleExchangeExec', 'HashPartitioning', 'SortExec', 'SortArray', 'Alias', 'Literal', 'Count', 'CollectList', 'CollectSet', - 'AggregateExpression', 'ProjectExec') + 'AggregateExpression', 'ProjectExec', *non_utc_allow) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) @pytest.mark.parametrize('replace_mode', _replace_modes_single_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_with_distinct_fallback(data_gen, replace_mode, aqe_enabled, @@ -1264,7 +1257,7 @@ def test_first_last_reductions_decimal_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_first_last_reductions_nested_types(data_gen): assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic @@ -1273,7 +1266,7 @@ def test_first_last_reductions_nested_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_generic_reductions(data_gen): local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) assert_gpu_and_cpu_are_equal_collect( @@ -1291,7 +1284,7 @@ def test_generic_reductions(data_gen): conf=local_conf) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_count(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen) \ @@ -1303,7 +1296,7 @@ def test_count(data_gen): conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( @@ -1327,7 +1320,7 @@ def test_arithmetic_reductions(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_collect_list_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( # coalescing because 
collect_list is not deterministic @@ -1346,7 +1339,7 @@ def test_collect_list_reductions(data_gen): @pytest.mark.parametrize('data_gen', _no_neg_zero_all_basic_gens + decimal_gens + _struct_only_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_collect_set_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('sort_array(collect_set(a))'), @@ -1360,7 +1353,7 @@ def test_collect_empty(): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] agg_fn = lambda df: df.groupBy('a').agg( @@ -1374,7 +1367,7 @@ def test_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _struct_only_nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sorted_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] # sort by more than the group by columns to be sure that first/last don't remove the ordering @@ -1392,7 +1385,7 @@ def test_sorted_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_agg_count(data_gen, count_func): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], @@ -2049,7 +2042,7 @@ def test_std_variance_partial_replace_fallback(data_gen, null_gen] + array_gens_sample + struct_gens_sample @ignore_order(local=True) @pytest.mark.parametrize('data_gen', gens_for_max_min, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_min_max_in_groupby_and_reduction(data_gen): df_gen = [('a', data_gen), ('b', RepeatSeqGen(IntegerGen(), length=20))] diff --git a/integration_tests/src/main/python/hashing_test.py b/integration_tests/src/main/python/hashing_test.py index e2a753ecaeb..6bd56da933d 100644 --- a/integration_tests/src/main/python/hashing_test.py +++ b/integration_tests/src/main/python/hashing_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect -from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from spark_session import is_before_spark_320 @@ -47,13 +46,11 @@ @ignore_order(local=True) @pytest.mark.parametrize("gen", _xxhash_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_single_column(gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : 
unary_op_df(spark, gen).selectExpr("a", "xxhash64(a)")) @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_multi_column(): gen = StructGen(_struct_of_xxhash_gens.children, nullable=False) col_list = ",".join(gen.data_type.fieldNames()) diff --git a/integration_tests/src/main/python/hive_delimited_text_test.py b/integration_tests/src/main/python/hive_delimited_text_test.py index 4d07a077ec0..b0467aa8910 100644 --- a/integration_tests/src/main/python/hive_delimited_text_test.py +++ b/integration_tests/src/main/python/hive_delimited_text_test.py @@ -121,6 +121,7 @@ def read_impl(spark): return read_impl +non_utc_allow_for_test_basic_hive_text_read=['HiveTableScanExec', 'DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else [] @pytest.mark.skipif(is_spark_cdh(), reason="Hive text reads are disabled on CDH, as per " "https://github.com/NVIDIA/spark-rapids/pull/7628") @@ -187,7 +188,7 @@ def read_impl(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_basic_hive_text_read) def test_basic_hive_text_read(std_input_path, name, schema, spark_tmp_table_factory, options): assert_gpu_and_cpu_are_equal_collect(read_hive_text_sql(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), @@ -240,7 +241,7 @@ def read_hive_text_table(spark, text_table_name, fields="my_field"): "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_basic_hive_text_read) def test_hive_text_round_trip(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -282,9 +283,8 @@ def read_hive_text_table_partitions(spark, text_table_name, partition): reason="Hive text reads are disabled on CDH, as per " "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float -@allow_non_gpu("EqualTo,IsNotNull,Literal") # Accounts for partition predicate: `WHERE dt='1'` +@allow_non_gpu("EqualTo,IsNotNull,Literal", *non_utc_allow_for_test_basic_hive_text_read) # Accounts for partition predicate: `WHERE dt='1'` @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -301,9 +301,8 @@ def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_ta reason="Hive text reads are disabled on CDH, as per " "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float -@allow_non_gpu("EqualTo,IsNotNull,Literal,Or") # Accounts for partition predicate 
+@allow_non_gpu("EqualTo,IsNotNull,Literal,Or", *non_utc_allow_for_test_basic_hive_text_read) # Accounts for partition predicate @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_two_partitions(spark_tmp_path, data_gen, spark_tmp_table_factory): """ Added to reproduce: https://github.com/NVIDIA/spark-rapids/issues/7383 @@ -529,7 +528,7 @@ def create_table_with_compressed_files(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_basic_hive_text_read) def test_basic_hive_text_write(std_input_path, input_dir, schema, spark_tmp_table_factory, mode, options): # Configure table options, including schema. if options is None: diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 7bc5ceede85..ae7052dffd7 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -59,7 +59,7 @@ def _restricted_timestamp(nullable=True): @pytest.mark.skipif(not is_hive_available(), reason="Hive is missing") @pytest.mark.parametrize("gens", _write_gens, ids=idfn) @pytest.mark.parametrize("storage", ["PARQUET", "nativeorc", "hiveorc"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_optimized_hive_ctas_basic(gens, storage, spark_tmp_table_factory): data_table = spark_tmp_table_factory.get() gen_list = [('c' + str(i), gen) for i, gen in enumerate(gens)] diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index ba172715936..6660e663c92 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -170,7 +170,6 @@ def do_join(spark): (all_gen, '1g'), (join_small_batch_gens, '1000')), ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join(data_gen, join_type, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -181,7 +180,6 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_ridealong(data_gen, join_type): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) @@ -191,11 +189,11 @@ def do_join(spark): # For floating point values the normalization is done using a higher order function. 
We could probably work around this # for now it falls back to the CPU @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'ArrayTransform', 'LambdaFunction', - 'NamedLambdaVariable', 'NormalizeNaNAndZero', 'ShuffleExchangeExec', 'HashPartitioning') + 'NamedLambdaVariable', 'NormalizeNaNAndZero', 'ShuffleExchangeExec', 'HashPartitioning', + *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_wrong_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -215,7 +213,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) @pytest.mark.parametrize('sub_part_enabled', ['false', 'true'], ids=['SubPartition_OFF', 'SubPartition_ON']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_join_ridealong(data_gen, join_type, sub_part_enabled): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 50, 500) @@ -232,7 +230,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_right_table(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -244,7 +242,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_right_table_ridealong(data_gen, join_type): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) @@ -258,7 +256,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_right_table_with_job_group(data_gen, join_type): with_cpu_session(lambda spark : spark.sparkContext.setJobGroup("testjob1", "test", False)) def do_join(spark): @@ -273,7 +271,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)], '100')), ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cartesian_join(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -313,7 +311,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen, '1g'), (join_small_batch_gens, '100')), ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_cartesian_join_with_condition(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -331,7 +329,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens, '100')), ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_nested_loop_join(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -367,7 +365,7 @@ def do_join(spark): (join_ast_gen, '1g'), ([int_gen], 100)), ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -382,7 +380,7 @@ def do_join(spark): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -441,7 +439,7 @@ def do_join(spark): float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen)) @@ -454,7 +452,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -470,7 +468,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) 
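Note: the join_test.py hunks above all follow the same pattern: instead of xfail-ing a whole test under a non-UTC session time zone, a conditional allow list is spread into @allow_non_gpu so that only the named execs may fall back to the CPU, and only when the zone is not UTC. A minimal standalone sketch of the mechanism (the is_not_utc stand-in and the exec names below are illustrative only; the real helpers come from the integration tests' conftest.py and marks.py):

import os
import pytest

def is_not_utc():
    # Stand-in only: the real helper lives in conftest.py and checks the
    # configured Spark session time zone rather than the TZ environment variable.
    return os.environ.get('TZ', 'UTC') not in ('UTC', 'Etc/UTC')

# Empty under UTC, so the test still requires full GPU execution there; under any
# other zone the listed execs may run on the CPU without failing the test.
non_utc_allow_example = ['ProjectExec', 'FileSourceScanExec'] if is_not_utc() else []

@pytest.mark.allow_non_gpu('SortMergeJoinExec', *non_utc_allow_example)
def test_example_join():
    # Placeholder body; the real tests build CPU and GPU DataFrames and compare results.
    assert True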
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -485,7 +483,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -494,29 +492,27 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return broadcast(left).join(right, how=join_type).selectExpr('COUNT(*)') assert_gpu_and_cpu_are_equal_collect(do_join) -@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual') +@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['LeftOuter', 'LeftSemi', 'LeftAnti', 'FullOuter'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return broadcast(left).join(right, (left.b >= right.r_b), join_type) assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec') -@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual') +@allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['RightOuter', 'FullOuter'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -533,7 +529,7 @@ def do_join(spark): # Specify 200 shuffle partitions to test cases where streaming side is empty # as in https://github.com/NVIDIA/spark-rapids/issues/7516 @pytest.mark.parametrize('shuffle_conf', [{}, {'spark.sql.shuffle.partitions': 200}], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def 
test_broadcast_join_left_table(data_gen, join_type, shuffle_conf): def do_join(spark): left, right = create_df(spark, data_gen, 250, 500) @@ -545,7 +541,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_with_conditionals(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -600,7 +596,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sortmerge_join_with_condition_ast(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -717,7 +713,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sortmerge_join_struct_as_key(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -729,7 +725,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sortmerge_join_struct_mixed_key(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -742,7 +738,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -757,7 +753,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_right_struct_as_key(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -769,7 +765,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, 
ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_broadcast_join_right_struct_mixed_key(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -790,11 +786,11 @@ def do_join(spark): assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'NormalizeNaNAndZero', 'CreateNamedStruct', - 'GetStructField', 'Literal', 'If', 'IsNull', 'ShuffleExchangeExec', 'HashPartitioning') + 'GetStructField', 'Literal', 'If', 'IsNull', 'ShuffleExchangeExec', 'HashPartitioning', + *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['FullOuter'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 6522ecb8499..d8aceb3e705 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -152,10 +152,12 @@ def test_json_input_meta(spark_tmp_path, v1_enabled_list): 'input_file_block_length()'), conf=updated_conf) +allow_non_gpu_for_json_scan = ['FileSourceScanExec', 'BatchScanExec'] if is_not_utc() else [] json_supported_date_formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'MM-yyyy', 'MM/yyyy', 'MM-dd-yyyy', 'MM/dd/yyyy', 'dd-MM-yyyy', 'dd/MM/yyyy'] @pytest.mark.parametrize('date_format', json_supported_date_formats, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +@allow_non_gpu(*allow_non_gpu_for_json_scan) def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list): gen = StructGen([('a', DateGen())], nullable=False) data_path = spark_tmp_path + '/JSON_DATA' @@ -181,10 +183,11 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li "'T'HH:mm[:ss]", "'T'HH:mm"] +not_utc_allow_for_test_json_scan = ['BatchScanExec', 'FileSourceScanExec'] if is_not_utc() else [] @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "json"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*not_utc_allow_for_test_json_scan) def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -204,20 +207,18 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena conf=updated_conf) @allow_non_gpu('FileSourceScanExec', 'ProjectExec') -@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') +@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+.') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', 
json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') @allow_non_gpu('BatchScanExec', 'ProjectExec') -@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') +@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+.') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') @@ -245,11 +246,18 @@ def do_read(spark): if timestamp_type == "TIMESTAMP_LTZ": - assert_cpu_and_gpu_are_equal_collect_with_capture( - lambda spark : do_read(spark), - exist_classes = 'Gpu' + cpu_scan_class, - non_exist_classes = cpu_scan_class, - conf=updated_conf) + if is_not_utc(): + # Non-UTC time zones are not supported for JSON, so skip the plan-capture check. + # Tracked in https://github.com/NVIDIA/spark-rapids/issues/9912 + assert_gpu_and_cpu_are_equal_collect(lambda spark: do_read(spark), conf = updated_conf) + else: + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark : do_read(spark), + exist_classes = 'Gpu' + cpu_scan_class, + non_exist_classes = cpu_scan_class, + conf=updated_conf) + + else: # we fall back to CPU due to "unsupported data types in output: TimestampNTZType" assert_gpu_fallback_collect( @@ -281,6 +289,7 @@ def do_read(spark): @pytest.mark.parametrize('allow_non_numeric_numbers', ["true", "false"]) @pytest.mark.parametrize('allow_numeric_leading_zeros', ["true"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +@allow_non_gpu(*not_utc_allow_for_test_json_scan) def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_numeric_numbers, allow_numeric_leading_zeros, ansi_enabled, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, @@ -330,6 +339,10 @@ def test_read_valid_json(spark_tmp_table_factory, std_input_path, read_func, fil {}), conf=conf) + +# Allow non-GPU execution when the time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +not_utc_json_scan_allow=['FileSourceScanExec'] if is_not_utc() else [] + @approximate_float @pytest.mark.parametrize('filename', [ 'dates.json', @@ -339,8 +352,8 @@ def test_read_valid_json(spark_tmp_table_factory, std_input_path, read_func, fil @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) @pytest.mark.parametrize('time_parser_policy', [ pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')), - 'CORRECTED', - 'EXCEPTION' + pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*not_utc_json_scan_allow)), + pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*not_utc_json_scan_allow)) ]) def 
test_json_read_valid_dates(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, @@ -355,6 +368,7 @@ def test_json_read_valid_dates(std_input_path, filename, schema, read_func, ansi else: assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) + @approximate_float @pytest.mark.parametrize('filename', [ 'dates_invalid.json', @@ -364,8 +378,8 @@ def test_json_read_valid_dates(std_input_path, filename, schema, read_func, ansi @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) @pytest.mark.parametrize('time_parser_policy', [ pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')), - 'CORRECTED', - 'EXCEPTION' + pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*not_utc_json_scan_allow)), + pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*not_utc_json_scan_allow)) ]) def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, @@ -385,6 +399,11 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an else: assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) +# Allow non-GPU execution when the time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +non_utc_file_source_scan_allow = ['FileSourceScanExec'] if is_not_utc() else [] + +non_utc_project_allow = ['ProjectExec'] if is_not_utc() else [] + @approximate_float @pytest.mark.parametrize('filename', [ 'timestamps.json', @@ -394,10 +413,10 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) @pytest.mark.parametrize('time_parser_policy', [ pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')), - 'CORRECTED', - 'EXCEPTION' + # For non-UTC cases, CORRECTED and EXCEPTION will fall back to the CPU due to missing time zone support. 
+ pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_file_source_scan_allow)), + pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*non_utc_file_source_scan_allow)) ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, \ spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, @@ -455,7 +474,7 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list): lambda spark : spark.read.schema(schema).json(data_path), conf=updated_conf) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_map(): # The test here is working around some inconsistencies in how the keys are parsed for maps # on the GPU the keys are dense, but on the CPU they are sparse @@ -490,7 +509,7 @@ def test_from_json_map_fallback(): 'struct', 'struct', ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct(schema): # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/9588 json_string_gen = StringGen(r'{"a": [1-9]{0,5}, "b": "[A-Z]{0,5}", "c": 1\d\d\d}') \ @@ -510,7 +529,7 @@ def test_from_json_struct(schema): r'{ "bool": [0-9]{4}-[0-9]{2}-[0-9]{2} }', r'{ "bool": "[0-9]{4}-[0-9]{2}-[0-9]{2}" }' ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct_boolean(pattern): json_string_gen = StringGen(pattern) \ .with_special_case('', weight=50) \ @@ -520,7 +539,7 @@ def test_from_json_struct_boolean(pattern): .select(f.col('a'), f.from_json('a', 'struct')), conf={"spark.rapids.sql.expression.JsonToStructs": True}) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct_decimal(): json_string_gen = StringGen(r'{ "a": "[+-]?([0-9]{0,5})?(\.[0-9]{0,2})?([eE][+-]?[0-9]{1,2})?" 
}') \ .with_special_pattern('', weight=50) \ @@ -551,8 +570,8 @@ def test_from_json_struct_decimal(): "(true|false)" ]) @pytest.mark.parametrize('date_format', [ - "", - "yyyy-MM-dd", + pytest.param("", marks=pytest.mark.allow_non_gpu(*non_utc_project_allow)), + pytest.param("yyyy-MM-dd", marks=pytest.mark.allow_non_gpu(*non_utc_project_allow)), # https://github.com/NVIDIA/spark-rapids/issues/9667 pytest.param("dd/MM/yyyy", marks=pytest.mark.allow_non_gpu('ProjectExec')), ]) @@ -560,7 +579,6 @@ def test_from_json_struct_decimal(): pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')), "CORRECTED" ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_date(date_gen, date_format, time_parser_policy): json_string_gen = StringGen(r'{ "a": ' + date_gen + ' }') \ .with_special_case('{ "a": null }') \ @@ -608,6 +626,9 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format conf={"spark.rapids.sql.expression.JsonToStructs": True, 'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}) +# Allow non-GPU execution when the time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653 +non_utc_project_allow = ['ProjectExec'] if is_not_utc() else [] + @pytest.mark.parametrize('timestamp_gen', [ # "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]" "\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}-[0-3]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\\.[0-9]{1,6})?Z?[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]}?\"", @@ -632,8 +653,9 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format "(true|false)" ]) @pytest.mark.parametrize('timestamp_format', [ - "", - "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]", + # Even with a valid timestamp format, CPU fallback still happens since non-UTC is not supported for JSON. 
+ pytest.param("", marks=pytest.mark.allow_non_gpu(*non_utc_project_allow)), + pytest.param("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]", marks=pytest.mark.allow_non_gpu(*non_utc_project_allow)), # https://github.com/NVIDIA/spark-rapids/issues/9723 pytest.param("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", marks=pytest.mark.allow_non_gpu('ProjectExec')), pytest.param("dd/MM/yyyy'T'HH:mm:ss[.SSS][XXX]", marks=pytest.mark.allow_non_gpu('ProjectExec')), @@ -643,7 +665,6 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format "CORRECTED" ]) @pytest.mark.parametrize('ansi_enabled', [ True, False ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled): json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \ .with_special_case('{ "a": null }') \ @@ -695,7 +716,7 @@ def test_from_json_struct_timestamp_fallback_non_default_format(timestamp_gen, t @pytest.mark.parametrize('schema', ['struct', 'struct>', 'struct>']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct_of_struct(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": {"name": "[A-Z]{1}[a-z]{2,5}", "age": 1\d}}') \ @@ -710,7 +731,7 @@ def test_from_json_struct_of_struct(schema): @pytest.mark.parametrize('schema', ['struct', 'struct>>', 'struct>>']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct_of_list(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": \[{"name": "[A-Z]{1}[a-z]{2,5}", "class": "junior"},' \ @@ -723,7 +744,7 @@ def test_from_json_struct_of_list(schema): conf={"spark.rapids.sql.expression.JsonToStructs": True}) @pytest.mark.parametrize('schema', ['struct', 'struct']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_from_json_struct_all_empty_string_input(schema): json_string_gen = StringGen('') assert_gpu_and_cpu_are_equal_collect( @@ -846,6 +867,7 @@ def struct_to_json(spark): 'Etc/UTC', pytest.param('UTC+07:00', marks=pytest.mark.allow_non_gpu('ProjectExec')), ]) +@pytest.mark.skipif(is_not_utc(), reason='Duplicated as original test case designed which it is parameterized by timezone. 
https://github.com/NVIDIA/spark-rapids/issues/9653.') def test_structs_to_json_timestamp(spark_tmp_path, data_gen, timestamp_format, timezone): struct_gen = StructGen([ ("b", StructGen([('child', data_gen)], nullable=True)), diff --git a/integration_tests/src/main/python/limit_test.py b/integration_tests/src/main/python/limit_test.py index efe81c1058a..5e116b00654 100644 --- a/integration_tests/src/main/python/limit_test.py +++ b/integration_tests/src/main/python/limit_test.py @@ -15,13 +15,11 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_340 from marks import allow_non_gpu, approximate_float @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample + struct_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( # We need some processing after the limit to avoid a CollectLimitExec @@ -82,7 +80,6 @@ def test_non_zero_offset_with_limit(limit, offset, batch_size): @pytest.mark.skipif(is_before_spark_340(), reason='offset is introduced from Spark 3.4.0') @allow_non_gpu('ShuffleExchangeExec') # when limit = 0, ShuffleExchangeExec is not replaced. @approximate_float -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_order_by_offset_with_limit(limit, offset, data_gen, batch_size): # In CPU version of spark, (limit, offset) can not be negative number. # Test case description: diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py index 5daeb916e22..b35789b62f5 100644 --- a/integration_tests/src/main/python/map_test.py +++ b/integration_tests/src/main/python/map_test.py @@ -57,7 +57,7 @@ @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_map_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -69,7 +69,7 @@ def test_map_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_map_values(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -81,7 +81,6 @@ def test_map_values(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_entries(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -117,7 +116,6 @@ def decimal_value_gen(): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_get_map_value_string_keys(data_gen): index_gen = StringGen() assert_gpu_and_cpu_are_equal_collect( @@ -141,7 +139,6 @@ def test_get_map_value_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_numeric_keys(data_gen): key_gen = data_gen._key_gen assert_gpu_and_cpu_are_equal_collect( @@ -155,7 +152,7 @@ def test_get_map_value_numeric_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_get_map_value_supported_keys(data_gen): key_gen = data_gen._key_gen # first expression is not guaranteed to hit @@ -194,7 +191,7 @@ def query_map_scalar(spark): @allow_non_gpu('WindowLocalExec') @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9683') @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_map_scalars_supported_key_types(data_gen): key_gen = data_gen._key_gen def query_map_scalar(spark): @@ -232,7 +229,7 @@ def query_map_scalar(spark): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_get_map_value_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -244,7 +241,7 @@ def test_get_map_value_date_keys(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_get_map_value_timestamp_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -263,7 +260,7 @@ def test_map_side_effects(): @pytest.mark.parametrize('key_gen', [StringGen(nullable=False), IntegerGen(nullable=False), basic_struct_gen], ids=idfn) @pytest.mark.parametrize('value_gen', [StringGen(nullable=True), IntegerGen(nullable=True), basic_struct_gen], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_single_entry_map(key_gen, value_gen): data_gen = [('a', key_gen), ('b', value_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -319,11 +316,12 @@ def test_map_expr_literal_keys_dupe_last_win(): conf={'spark.sql.mapKeyDedupPolicy':'LAST_WIN'}) -def test_map_expr_literal_keys_dupe_exception(): +@pytest.mark.parametrize('map_expr',['map("key1", b, "key1", a) as m1', + 'map(double("NaN"), b, double("NaN"), a) as m1'], ids=idfn) +def test_map_expr_literal_keys_dupe_exception(map_expr): data_gen = [('a', StringGen(nullable=False)), ('b', 
StringGen(nullable=False))] assert_gpu_and_cpu_error( - lambda spark: gen_df(spark, data_gen).selectExpr( - 'map("key1", b, "key1", a) as m1').collect(), + lambda spark: gen_df(spark, data_gen).selectExpr(map_expr).collect(), conf={'spark.sql.mapKeyDedupPolicy':'EXCEPTION'}, error_message = "Duplicate map key") @@ -467,7 +465,7 @@ def test_simple_get_map_value_with_strict_index(strict_index, data_gen): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_element_at_map_string_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -481,7 +479,7 @@ def test_element_at_map_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_element_at_map_numeric_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -497,7 +495,7 @@ def test_element_at_map_numeric_keys(data_gen): [MapGen(DecimalGen(precision=35, scale=2, nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_get_map_value_element_at_map_dec_col_keys(data_gen): keys = DecimalGen(precision=35, scale=2) assert_gpu_and_cpu_are_equal_collect( @@ -523,7 +521,7 @@ def test_get_map_value_element_at_map_string_col_keys_ansi(data_gen, ansi): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_get_map_value_element_at_map_string_col_keys(data_gen): keys = StringGen(pattern='key_[0-9]') assert_gpu_and_cpu_are_equal_collect( @@ -580,7 +578,7 @@ def test_get_map_value_string_col_keys_ansi_null(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_element_at_map_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -594,7 +592,7 @@ def test_element_at_map_date_keys(data_gen): [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_element_at_map_timestamp_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -626,7 +624,7 @@ def test_map_element_at_ansi_null(data_gen): conf=ansi_enabled_conf) 
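The json_test.py hunks above apply the same idea at the granularity of individual parametrize values: only the CORRECTED and EXCEPTION parser policies receive the conditional non-UTC allowance, while LEGACY keeps its unconditional CPU fallback. A standalone sketch of that pytest.param pattern (the helper and exec names are illustrative, not the plugin's API):

import os
import pytest

def is_not_utc():
    # Stand-in for the integration tests' conftest.py helper.
    return os.environ.get('TZ', 'UTC') not in ('UTC', 'Etc/UTC')

not_utc_scan_allow_example = ['FileSourceScanExec'] if is_not_utc() else []

@pytest.mark.parametrize('time_parser_policy', [
    # LEGACY always falls back to the CPU scan; the other policies may do so only under non-UTC.
    pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')),
    pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*not_utc_scan_allow_example)),
    pytest.param('EXCEPTION', marks=pytest.mark.allow_non_gpu(*not_utc_scan_allow_example)),
])
def test_example_policy(time_parser_policy):
    # Placeholder body; the real tests read JSON dates/timestamps and compare CPU vs GPU output.
    assert time_parser_policy in ('LEGACY', 'CORRECTED', 'EXCEPTION')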
@pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_transform_values(data_gen): def do_it(spark): columns = ['a', 'b', @@ -665,7 +663,7 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_128_map_gens + decimal_64_map_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_transform_keys(data_gen): # The processing here is very limited, because we need to be sure we do not create duplicate keys. # This can happen because of integer overflow, round off errors in floating point, etc. So for now @@ -725,7 +723,7 @@ def test_sql_map_scalars(query): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_map_filter(data_gen): columns = ['map_filter(a, (key, value) -> isnotnull(value) )', 'map_filter(a, (key, value) -> isnull(value) )', diff --git a/integration_tests/src/main/python/mortgage_test.py b/integration_tests/src/main/python/mortgage_test.py index 00bab066651..aed9aa63c85 100644 --- a/integration_tests/src/main/python/mortgage_test.py +++ b/integration_tests/src/main/python/mortgage_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_iterator -from conftest import is_not_utc from marks import approximate_float, incompat, ignore_order, allow_non_gpu, limit @incompat @@ -23,7 +22,6 @@ @limit @ignore_order @allow_non_gpu(any=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_mortgage(mortgage): assert_gpu_and_cpu_are_equal_iterator( lambda spark : mortgage.do_test_query(spark)) diff --git a/integration_tests/src/main/python/orc_cast_test.py b/integration_tests/src/main/python/orc_cast_test.py index cccd60125b9..48efd5c8174 100644 --- a/integration_tests/src/main/python/orc_cast_test.py +++ b/integration_tests/src/main/python/orc_cast_test.py @@ -17,6 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error from conftest import is_not_utc from data_gen import * +from marks import allow_non_gpu from pyspark.sql.types import * from spark_session import with_cpu_session from orc_test import reader_opt_confs @@ -48,9 +49,10 @@ def test_casting_among_integer_types(spark_tmp_path, reader_confs, v1_enabled_li lambda spark: spark.read.schema(schema_str).orc(data_path), conf=all_confs) +non_utc_allow_orc_scan=['ColumnarToRowExec', 'FileScan'] if is_not_utc() else [] @pytest.mark.parametrize('to_type', ['float', 'double', 'string', 'timestamp']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_casting_from_integer(spark_tmp_path, to_type): orc_path = spark_tmp_path + '/orc_cast_integer' # The Python 'datetime' module only supports a max-year of 10000, so we set the Long type max @@ -68,11 +70,11 @@ def test_casting_from_integer(spark_tmp_path, to_type): schema_str.format(*([to_type] * 
len(data_gen)))).orc(orc_path) ) - +non_utc_allow_for_test_casting_from_overflow_long = ['FileSourceScanExec', 'ColumnarToRowExec', ] if is_not_utc() else [] @pytest.mark.parametrize('overflow_long_gen', [LongGen(min_val=int(1e16)), LongGen(max_val=int(-1e16))]) @pytest.mark.parametrize('to_type', ['timestamp']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_casting_from_overflow_long) def test_casting_from_overflow_long(spark_tmp_path, overflow_long_gen,to_type): # Timestamp(micro-seconds) is actually type of int64, when casting long(int64) to timestamp, # we need to multiply 1e6 (or 1e3), and it may cause overflow. This function aims to test @@ -103,7 +105,7 @@ def test_casting_from_float_and_double(spark_tmp_path, to_type): @pytest.mark.parametrize('data_gen', [DoubleGen(max_exp=32, special_cases=None), DoubleGen(max_exp=32, special_cases=[8.88e9, 9.99e10, 1.314e11])]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): # ORC will assume the original double value in seconds, we need to convert them to # timestamp(INT64 in micro-seconds). @@ -127,7 +129,7 @@ def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): ) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_casting_from_overflow_long) def test_casting_from_overflow_double_to_timestamp(spark_tmp_path): orc_path = spark_tmp_path + '/orc_casting_from_overflow_double_to_timestamp' with_cpu_session( diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 409d0850987..c3bbd917b2e 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -62,13 +62,14 @@ def get_orc_timestamp_gen(nullable=True): # The Count result can not be sorted, so local sort can not be used. 
reader_opt_confs_for_count = reader_opt_confs_common + [multithreaded_orc_file_reader_combine_unordered_conf] +non_utc_allow_orc_file_source_scan=['ColumnarToRowExec', 'FileSourceScanExec'] if is_not_utc() else [] @pytest.mark.parametrize('name', ['timestamp-date-test.orc']) @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_file_source_scan) def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, reader_confs): all_confs = copy_and_update(reader_confs, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -127,6 +128,8 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, MapGen(StructGen([['child0', byte_gen], ['child1', long_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', long_gen]]))] +non_utc_allow_orc_scan=['ColumnarToRowExec', 'FileSourceScanExec', 'BatchScanExec'] if is_not_utc() else [] + orc_gens_list = [orc_basic_gens, orc_array_gens_sample, orc_struct_gens_sample, @@ -159,8 +162,8 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf): @pytest.mark.parametrize('orc_gens', orc_gens_list, ids=idfn) @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('v1_enabled_list', ['', 'orc']) +@allow_non_gpu(*non_utc_allow_orc_scan) def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -186,7 +189,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/ORC_DATA' # Append two struct columns to verify nested predicate pushdown. 
@@ -243,7 +246,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators @@ -310,7 +313,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators @@ -589,7 +592,7 @@ def test_read_struct_without_stream(spark_tmp_path): @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('case_sensitive', ["false", "true"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive): struct_gen = StructGen([('nested_col', orc_gen)]) # Map is not supported yet. 
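The ORC read tests above allow ColumnarToRowExec plus both FileSourceScanExec (the v1 DataSource scan) and BatchScanExec (the v2 scan) because they are parameterized over both read paths via v1_enabled_list. A sketch using a hypothetical helper to show how the allowance could be narrowed per path; the PR itself simply lists both execs:

def non_utc_scan_allow(v1_source, not_utc):
    # Hypothetical helper, for illustration only: pick the scan exec for the
    # DataSource API in use and allow it (plus the columnar-to-row transition)
    # to fall back to the CPU only under a non-UTC time zone.
    scan = 'FileSourceScanExec' if v1_source else 'BatchScanExec'
    return [scan, 'ColumnarToRowExec'] if not_utc else []

assert non_utc_scan_allow(v1_source=True, not_utc=False) == []
assert non_utc_scan_allow(v1_source=False, not_utc=True) == ['BatchScanExec', 'ColumnarToRowExec']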
@@ -777,7 +780,7 @@ def test_orc_read_varchar_as_string(std_input_path): @pytest.mark.parametrize('gens', orc_gens_list, ids=idfn) @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_order): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -792,7 +795,7 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_ @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_orc_scan) def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order): orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py index 5617f8e20e5..a28aad58406 100644 --- a/integration_tests/src/main/python/orc_write_test.py +++ b/integration_tests/src/main/python/orc_write_test.py @@ -81,7 +81,7 @@ @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -116,7 +116,7 @@ def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl): # There are race conditions around when individual files are read in for partitioned data @ignore_order @pytest.mark.parametrize('orc_gen', orc_part_write_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_part_write_round_trip(spark_tmp_path, orc_gen): gen_list = [('a', RepeatSeqGen(orc_gen, 10)), ('b', orc_gen)] @@ -170,8 +170,8 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') -def test_write_save_table(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory): +@allow_non_gpu(*non_utc_allow) +def test_write_save_table_orc(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' all_confs={'spark.sql.sources.useV1SourceList': "orc", @@ -189,11 +189,13 @@ def write_orc_sql_from(spark, df, data_path, write_to_table): write_cmd = 'CREATE TABLE `{}` USING ORC location \'{}\' AS SELECT * from `{}`'.format(write_to_table, data_path, tmp_view_name) 
spark.sql(write_cmd) +non_utc_hive_save_table_allow = ['ExecutedCommandExec', 'DataWritingCommandExec', 'CreateDataSourceTableAsSelectCommand', 'WriteFilesExec'] if is_not_utc() else [] + @pytest.mark.order(2) @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('ts_type', ["TIMESTAMP_MICROS", "TIMESTAMP_MILLIS"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_hive_save_table_allow) def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -203,9 +205,8 @@ def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark data_path, conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True}) -@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') +@allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec', *non_utc_allow) @pytest.mark.parametrize('codec', ['zlib', 'lzo']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orc_write_compression_fallback(spark_tmp_path, codec, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -262,7 +263,7 @@ def sql_write(spark, path): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index f6cc2a0141b..ae3aeab7410 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -164,7 +164,7 @@ def setup_table(spark): @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_read_round_trip(spark_tmp_path, parquet_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -299,7 +299,7 @@ def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_l @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/PARQUET_DATA' 
gen_list = [('a', RepeatSeqGen(parquet_gen, 100)), ('b', parquet_gen)] @@ -313,13 +313,13 @@ def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1 lambda spark: rf(spark).select(f.col('a') >= s0), conf=all_confs) +@pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone") @pytest.mark.parametrize('parquet_gens', [parquet_nested_datetime_gen], ids=idfn) @pytest.mark.parametrize('ts_type', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, parquet_gens, ts_type, ts_rebase_write, ts_rebase_read, reader_confs, v1_enabled_list): @@ -356,10 +356,10 @@ def test_parquet_decimal_read_legacy(spark_tmp_path, parquet_gens, read_func, re all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) assert_gpu_and_cpu_are_equal_collect(read_func(data_path), conf=all_confs) +@pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone") @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize('batch_size', [100, INT_MAX]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs, batch_size): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -389,9 +389,9 @@ def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader # In this we are reading the data, but only reading the key the data was partitioned by +@pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone") @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reader_confs): parquet_gens = [byte_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -534,7 +534,7 @@ def read_and_remove(spark): @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone") def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ 
-559,7 +559,7 @@ def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone") def test_parquet_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -875,7 +875,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters(spark_tmp_path, read @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' @@ -903,7 +903,7 @@ def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_c @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_reading_from_unaligned_pages_all_types_dict_optimized(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index a4600de7b86..6c3ab0c14a2 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -17,7 +17,8 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run, is_not_utc -from data_gen import copy_and_update +from data_gen import copy_and_update, non_utc_allow +from marks import allow_non_gpu from pathlib import Path import pytest from spark_session import is_before_spark_330, is_spark_350_or_later @@ -122,7 +123,7 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) 
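The recurring idiom in this patch: `non_utc_allow` (imported from data_gen.py) and the inline `non_utc_*_allow` lists evaluate to an empty list under a UTC session time zone, so splatting them into `@allow_non_gpu(...)` adds nothing for UTC runs, while under a non-UTC time zone the named execs are permitted to fall back to the CPU instead of xfailing the whole test. A minimal sketch of the shape of such a definition, assuming a stand-in `is_not_utc()` helper and placeholder exec names (the real list lives in data_gen.py and may differ):

import os
import pytest

def is_not_utc():
    # Stand-in for conftest.is_not_utc(); the real helper checks the configured
    # Spark session time zone rather than the TZ environment variable.
    return os.environ.get('TZ', 'UTC') != 'UTC'

# Empty under UTC, so the mark below allows nothing extra in UTC runs.
non_utc_allow = ['FileSourceScanExec', 'BatchScanExec'] if is_not_utc() else []

@pytest.mark.allow_non_gpu(*non_utc_allow)
def test_example_round_trip():
    pass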
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 9584f2a3520..d07def7483f 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -90,7 +90,7 @@ @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_round_trip(spark_tmp_path, parquet_gens): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -136,7 +136,7 @@ def test_write_round_trip_corner(spark_tmp_path, par_gen): ArrayGen(TimestampGen(), max_length=10), MapGen(TimestampGen(nullable=False), TimestampGen())]], ids=idfn) @pytest.mark.parametrize('ts_type', parquet_ts_write_options) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -150,7 +150,7 @@ def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): @pytest.mark.parametrize('ts_type', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase', ['CORRECTED']) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -174,7 +174,7 @@ def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): @ignore_order @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gen', parquet_part_write_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_part_write_round_trip(spark_tmp_path, parquet_gen): gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)] @@ -188,8 +188,7 @@ def test_part_write_round_trip(spark_tmp_path, parquet_gen): @pytest.mark.skipif(is_spark_340_or_later() or is_databricks122_or_later(), reason="`WriteFilesExec` is only supported in Spark 340+") @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) -@pytest.mark.allow_non_gpu("DataWritingCommandExec") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.allow_non_gpu("DataWritingCommandExec", *non_utc_allow) def test_int96_write_conf(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -206,8 +205,7 @@ def test_int96_write_conf(spark_tmp_path, data_gen): @pytest.mark.skipif(is_before_spark_340() and not is_databricks122_or_later(), reason="`WriteFilesExec` is only supported in Spark 340+") @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) # Note: From Spark 340, 
WriteFilesExec is introduced. -@pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec", *non_utc_allow) def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -221,7 +219,7 @@ def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): ['DataWritingCommandExec', 'WriteFilesExec'], confs) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_all_null_int96(spark_tmp_path): class AllNullTimestampGen(TimestampGen): def start(self, rand): @@ -251,7 +249,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -267,9 +265,11 @@ def write_parquet_sql_from(spark, df, data_path, write_to_table): write_cmd = 'CREATE TABLE `{}` USING PARQUET location \'{}\' AS SELECT * from `{}`'.format(write_to_table, data_path, tmp_view_name) spark.sql(write_cmd) +non_utc_hive_save_table_allow = ['ExecutedCommandExec', 'DataWritingCommandExec', 'CreateDataSourceTableAsSelectCommand', 'WriteFilesExec'] if is_not_utc() else [] + @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_hive_save_table_allow) def test_write_sql_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -292,7 +292,7 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))), ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))]) @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase): ts_write, gen = ts_write_data_gen data_path = spark_tmp_path + '/PARQUET_DATA' @@ -471,7 +471,7 @@ def generate_map_with_empty_validity(spark, path): @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', ['EXCEPTION']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because 
of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_write_fails_legacy_datetime(spark_tmp_path, data_gen, ts_write, ts_rebase_write): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, @@ -489,7 +489,7 @@ def writeParquetCatchException(spark, data_gen, data_path): @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, data_gen, ts_write, ts_rebase_write, ts_rebase_read): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -533,7 +533,7 @@ def test_it(spark): with_gpu_session(test_it, conf) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -771,9 +771,12 @@ def read_table(spark, path): func(create_table, read_table, data_path, conf) +non_utc_hive_parquet_write_allow = ['DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else [] + # Test to avoid regression on a known bug in Spark. For details please visit https://github.com/NVIDIA/spark-rapids/issues/8693 -@pytest.mark.parametrize('ts_rebase', ['LEGACY', 'CORRECTED']) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('ts_rebase', [ + pytest.param('LEGACY', marks=pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone")), + pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_hive_parquet_write_allow))]) def test_hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase): def func_test(create_table, read_table, data_path, conf): assert_gpu_and_cpu_writes_are_equal_collect(create_table, read_table, data_path, conf=conf) diff --git a/integration_tests/src/main/python/qa_nightly_select_test.py b/integration_tests/src/main/python/qa_nightly_select_test.py index 1349de3fcdf..786b71face5 100644 --- a/integration_tests/src/main/python/qa_nightly_select_test.py +++ b/integration_tests/src/main/python/qa_nightly_select_test.py @@ -24,8 +24,8 @@ from qa_nightly_sql import * import pytest from spark_session import with_cpu_session, is_jvm_charset_utf8 -from marks import approximate_float, ignore_order, incompat, qarun -from data_gen import copy_and_update +from marks import approximate_float, ignore_order, incompat, qarun, allow_non_gpu +from data_gen import copy_and_update, non_utc_allow def num_stringDf(spark): print("### CREATE DATAFRAME 1 ####") @@ -159,7 +159,7 @@ def idfn(val): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') 
+@allow_non_gpu(*non_utc_allow) def test_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -172,7 +172,7 @@ def test_select(sql_query_line, pytestconfig): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_needs_sort_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -185,7 +185,6 @@ def test_needs_sort_select(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_JOIN_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_join(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -202,7 +201,6 @@ def init_tables(spark): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_PRE_ORDER_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_first_last(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -215,7 +213,6 @@ def test_select_first_last(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_FLOAT_SQL, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_float_order_local(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -230,7 +227,6 @@ def test_select_float_order_local(sql_query_line, pytestconfig): @qarun @pytest.mark.parametrize('sql_query_line', SELECT_REGEXP_SQL, ids=idfn) @pytest.mark.skipif(not is_jvm_charset_utf8(), reason="Regular expressions require UTF-8") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_regexp(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: diff --git a/integration_tests/src/main/python/repart_test.py b/integration_tests/src/main/python/repart_test.py index 60e0a191f25..54a540ce9a1 100644 --- a/integration_tests/src/main/python/repart_test.py +++ b/integration_tests/src/main/python/repart_test.py @@ -90,7 +90,6 @@ def test_union_struct_missing_children(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen))) @@ -101,7 +100,6 @@ def test_union(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. 
The types of the left col and right col is the same -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unionAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen))) @@ -116,7 +114,6 @@ def test_unionAll(data_gen): struct_of_maps], ids=idfn) # This tests the union of two DFs of structs with missing child column names. The missing child # column will be replaced by nulls in the output DF. This is a feature added in 3.1+ -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_missing_col_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).withColumnRenamed("a", "x") @@ -158,7 +155,6 @@ def assert_union_equal(gen1, gen2): StructGen([['child0', DecimalGen(7, 2)]]), nested_struct, struct_of_maps], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen))) @@ -170,14 +166,14 @@ def test_union_by_name(data_gen): pytest.param([('array' + str(i), gen) for i, gen in enumerate(array_gens_sample + [ArrayGen(BinaryGen(max_length=5), max_length=5)])]), pytest.param([('map' + str(i), gen) for i, gen in enumerate(map_gens_sample)]), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_coalesce_types(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen).coalesce(2)) @pytest.mark.parametrize('num_parts', [1, 10, 100, 1000, 2000], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_coalesce_df(num_parts, length): #This should change eventually to be more than just the basic gens gen_list = [('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])] @@ -193,7 +189,7 @@ def test_coalesce_df(num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_repartition_df(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -210,7 +206,7 @@ def test_repartition_df(data_gen, num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_repartition_df_for_round_robin(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -284,7 +280,7 @@ def test_hash_fallback(data_gen): ([('a', decimal_gen_64bit), ('b', decimal_gen_64bit), ('c', decimal_gen_64bit)], ['a', 'b', 'c']), ([('a', decimal_gen_128bit), ('b', decimal_gen_128bit), ('c', decimal_gen_128bit)], ['a', 'b', 'c']), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_hash_repartition_exact(gen, num_parts): data_gen = gen[0] part_on = gen[1] diff --git a/integration_tests/src/main/python/row-based_udf_test.py b/integration_tests/src/main/python/row-based_udf_test.py index 19b02f2e24e..e849a87b10e 100644 --- a/integration_tests/src/main/python/row-based_udf_test.py +++ b/integration_tests/src/main/python/row-based_udf_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql -from conftest import is_not_utc from data_gen import * from spark_session import with_spark_session, is_spark_350_or_later from conftest import skip_unless_precommit_tests @@ -34,7 +33,6 @@ def load_hive_udf(spark, udfname, udfclass): @pytest.mark.xfail(condition=is_spark_350_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/9064') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_simple_udf(): with_spark_session(skip_if_no_hive) @@ -48,7 +46,6 @@ def evalfn(spark): "SELECT i, emptysimple(s, 'const_string') FROM hive_simple_udf_test_table", conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'}) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_generic_udf(): with_spark_session(skip_if_no_hive) def evalfn(spark): diff --git a/integration_tests/src/main/python/row_conversion_test.py b/integration_tests/src/main/python/row_conversion_test.py index bc13419c8ec..92ea05d68be 100644 --- a/integration_tests/src/main/python/row_conversion_test.py +++ b/integration_tests/src/main/python/row_conversion_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, approximate_float, incompat from pyspark.sql.types import * @@ -29,7 +28,6 @@ # to be brought back to the CPU (rows) to be returned. # So we just need a very simple operation in the middle that # can be done on the GPU. 
-@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["g", string_gen], ["h", boolean_gen], @@ -44,7 +42,6 @@ def test_row_conversions(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["h", boolean_gen], @@ -53,7 +50,6 @@ def test_row_conversions_fixed_width(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width_wide(): gens = [["a{}".format(i), ByteGen(nullable=True)] for i in range(10)] + \ [["b{}".format(i), ShortGen(nullable=True)] for i in range(10)] + \ diff --git a/integration_tests/src/main/python/sample_test.py b/integration_tests/src/main/python/sample_test.py index 5ae72212bed..fc9d9fc4cbf 100644 --- a/integration_tests/src/main/python/sample_test.py +++ b/integration_tests/src/main/python/sample_test.py @@ -14,7 +14,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from spark_session import is_before_spark_330 @@ -39,7 +38,6 @@ def test_sample_produce_empty_batch(data_gen): # the following cases is the normal cases and do not use @ignore_order nested_gens = array_gens_sample + struct_gens_sample + map_gens_sample @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10) @@ -47,7 +45,6 @@ def test_sample(data_gen): ) @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample_with_replacement(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10).sample( diff --git a/integration_tests/src/main/python/schema_evolution_test.py b/integration_tests/src/main/python/schema_evolution_test.py index d9f4c0f0899..ff501324cc0 100644 --- a/integration_tests/src/main/python/schema_evolution_test.py +++ b/integration_tests/src/main/python/schema_evolution_test.py @@ -16,7 +16,7 @@ from conftest import is_not_utc from data_gen import * from datetime import date, datetime, timezone -from marks import ignore_order +from marks import ignore_order, allow_non_gpu import pytest from spark_session import is_databricks_runtime, is_databricks113_or_later @@ -60,9 +60,10 @@ def get_ddl(col_gen_pairs): """Given a list of column_name, data_generator paris, returns the corresponding DDL string""" return ', '.join([f"{c} {g.data_type.simpleString()}" for c, 
g in col_gen_pairs]) +non_utc_allow_for_test_column_add_after_partition = ['ColumnarToRowExec', 'DataWritingCommandExec', 'ExecutedCommandExec', 'FileSourceScanExec', 'WriteFilesExec'] if is_not_utc() else [] @ignore_order(local=True) @pytest.mark.parametrize("format", _formats) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow_for_test_column_add_after_partition) def test_column_add_after_partition(spark_tmp_table_factory, format): # Databricks 10.4 appears to be missing https://issues.apache.org/jira/browse/SPARK-39417 # so avoid generating nulls for numeric partitions diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py index 7fe208ae12d..3e447a5e772 100644 --- a/integration_tests/src/main/python/sort_test.py +++ b/integration_tests/src/main/python/sort_test.py @@ -52,7 +52,6 @@ def test_sort_nonbinary_carry_binary(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -60,7 +59,6 @@ def test_single_orderby(data_gen, order): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_first(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first(), f.col('a').desc_nulls_last()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby_on_array(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -105,7 +103,6 @@ def test_single_orderby_fallback_for_array_of_struct(data_gen, order): marks=pytest.mark.xfail(reason='opposite null order not supported')), pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_plain(data_gen, order, shuffle_parts, stable_sort): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order), @@ -133,7 +130,6 @@ def test_single_nested_orderby_fallback_for_nullorder(data_gen, order): orderable_without_neg_decimal = [n for n in (orderable_gens + orderable_not_null_gen) if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_without_neg_decimal + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100)) @@ -144,7 +140,6 @@ def 
test_single_orderby_with_limit(data_gen, order): pytest.param(f.col('a').desc(), all_basic_struct_gen), pytest.param(f.col('a').desc_nulls_last(), all_basic_struct_gen) ], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100), @@ -167,7 +162,6 @@ def test_single_nested_orderby_with_limit_fallback(data_gen, order): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_sort_in_part(data_gen, order): # We set `num_slices` to handle https://github.com/NVIDIA/spark-rapids/issues/2477 assert_gpu_and_cpu_are_equal_collect( @@ -190,7 +184,6 @@ def test_single_sort_in_part(data_gen, order): pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_sort_in_part(data_gen, order, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} assert_gpu_and_cpu_are_equal_collect( @@ -201,13 +194,13 @@ def test_single_nested_sort_in_part(data_gen, order, stable_sort): boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)]) ] + orderable_decimal_gens + single_level_array_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_orderby_on_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @@ -215,7 +208,7 @@ def test_multi_orderby_on_array(data_gen): # SPARK CPU itself has issue with negative scale for take ordered and project orderable_gens_sort_without_neg_decimal = [n for n in orderable_gens_sort if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_orderby_with_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -223,7 +216,7 @@ def 
test_multi_orderby_with_limit(data_gen): # We added in a partitioning optimization to take_ordered_and_project # This should trigger it. @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_orderby_with_limit_single_part(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -268,7 +261,7 @@ def test_single_orderby_with_skew(data_gen): # We are not trying all possibilities, just doing a few with numbers so the query works. @pytest.mark.parametrize('data_gen', [all_basic_struct_gen, StructGen([['child0', all_basic_struct_gen]])], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_single_nested_orderby_with_skew(data_gen, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} # When doing range partitioning the upstream data is sampled to try and get the bounds for cutoffs. @@ -312,7 +305,7 @@ def test_large_orderby(data_gen, stable_sort): simple_string_to_string_map_gen, ArrayGen(byte_gen, max_length=5)], ids=idfn) @pytest.mark.order(2) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_large_orderby_nested_ridealong(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. @@ -333,7 +326,7 @@ def test_large_orderby_nested_ridealong(data_gen): ArrayGen(byte_gen, max_length=5), ArrayGen(decimal_gen_128bit, max_length=5)], ids=idfn) @pytest.mark.order(2) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_orderby_nested_ridealong_limit(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. 
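Where CPU fallback is not viable — LEGACY datetime rebase is only defined for the UTC time zone — the patch skips instead of allowing, and test_hive_timestamp_value in parquet_write_test.py mixes both treatments per parameter value via pytest.param marks. A condensed sketch of that per-parameter pattern, with a placeholder test body and list name:

import pytest

def is_not_utc():
    return False  # stand-in for conftest.is_not_utc()

# Mirrors non_utc_hive_parquet_write_allow from the patch; empty in UTC runs.
non_utc_write_allow = ['DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else []

@pytest.mark.parametrize('ts_rebase', [
    # LEGACY rebase cannot run outside UTC, so it is skipped there outright.
    pytest.param('LEGACY', marks=pytest.mark.skipif(
        is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone")),
    # CORRECTED still runs, but the write execs may fall back to the CPU.
    pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_write_allow)),
])
def test_rebase_modes(ts_rebase):
    pass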
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 65865d0bdc1..1e871e85bd5 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -653,10 +653,10 @@ def test_byte_length(): @incompat def test_initcap(): # Because we don't use the same unicode version we need to limit - # the charicter set to something more reasonable + # the character set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357ȺéŸ_@%-]{0,15}[ \r\n\t]{1,2}){1,5}') + gen = StringGen('([aAbB1357ȺéŸ_@%-]{0,15}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) diff --git a/integration_tests/src/main/python/struct_test.py b/integration_tests/src/main/python/struct_test.py index 986781c32e0..0e230a95408 100644 --- a/integration_tests/src/main/python/struct_test.py +++ b/integration_tests/src/main/python/struct_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql -from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * @@ -34,7 +33,6 @@ def test_struct_scalar_project(): StructGen([["first", decimal_gen_64bit], ["second", decimal_gen_32bit], ["third", decimal_gen_32bit]]), StructGen([["first", decimal_gen_128bit], ["second", decimal_gen_128bit], ["third", decimal_gen_128bit]]), StructGen([["first", binary_gen], ["second", ArrayGen(BinaryGen(max_length=10), max_length=10)], ["third", binary_gen]])], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_struct_get_item(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -45,7 +43,6 @@ def test_struct_get_item(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + [binary_gen, null_gen] + single_level_array_gens + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_struct(data_gen): # Spark has no good way to create a map literal without the map function # so we are inserting one. diff --git a/integration_tests/src/main/python/subquery_test.py b/integration_tests/src/main/python/subquery_test.py index 25a70b47a17..e6d641d4212 100644 --- a/integration_tests/src/main/python/subquery_test.py +++ b/integration_tests/src/main/python/subquery_test.py @@ -14,13 +14,11 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql -from conftest import is_not_utc from data_gen import * from marks import * @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_basics(data_gen): # Fix num_slices at 1 to make sure that first/last returns same results under CPU and GPU. 
assert_gpu_and_cpu_are_equal_sql( @@ -33,7 +31,6 @@ def test_scalar_subquery_basics(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_struct(basic_gen): # single-level struct gen = [('ss', StructGen([['a', basic_gen], ['b', basic_gen]]))] @@ -68,7 +65,6 @@ def test_scalar_subquery_struct(basic_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_array(basic_gen): # single-level array assert_gpu_and_cpu_are_equal_sql( diff --git a/integration_tests/src/main/python/time_window_test.py b/integration_tests/src/main/python/time_window_test.py index 52071926309..ff367b506fb 100644 --- a/integration_tests/src/main/python/time_window_test.py +++ b/integration_tests/src/main/python/time_window_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect -from conftest import is_not_utc from data_gen import * from datetime import datetime from marks import ignore_order, allow_non_gpu @@ -30,7 +29,6 @@ @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -42,7 +40,6 @@ def test_grouped_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -50,7 +47,6 @@ def test_grouped_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window_array(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', ArrayGen(data_gen)]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -58,7 +54,6 @@ def test_grouped_sliding_window_array(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour')) @@ -67,7 +62,6 @@ def test_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sliding_window(data_gen): 
row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour', '1 hour')) @@ -78,7 +72,6 @@ def test_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample, ids=idfn) # This includes an expand and we produce a different order than the CPU does. Sort locally to allow sorting of all types @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_just_window(data_gen): row_gen = StructGen([['ts', timestamp_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index 9e3f5d05bcc..0604b74585d 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -14,7 +14,7 @@ import pytest -from conftest import is_at_least_precommit_run, is_not_utc +from conftest import is_at_least_precommit_run from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_341 from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version @@ -84,7 +84,6 @@ def iterator_add(to_process: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[ @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_scalar_udf_nested_type(data_gen): def nested_size(nested): return pd.Series([nested.size]).repeat(len(nested)) @@ -111,7 +110,6 @@ def pandas_sum(to_process: pd.Series) -> float: @approximate_float @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_aggregate_udf_more_types(data_gen): @f.pandas_udf('double') def group_size_udf(to_process: pd.Series) -> float: @@ -142,7 +140,6 @@ def pandas_sum(to_process: pd.Series) -> int: @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_aggregate_udf_more_types(data_gen): @f.pandas_udf('long') def group_size_udf(to_process: pd.Series) -> int: @@ -261,7 +258,6 @@ def pandas_add(data): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_apply_udf_more_types(data_gen): def group_size_udf(key, pdf): return pd.DataFrame([[len(key), len(pdf), len(pdf.columns)]]) @@ -289,7 +285,6 @@ def pandas_filter(iterator): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_map_udf_nested_type(data_gen): # Supported UDF output types by plugin: (commonCudfTypes + ARRAY).nested() + STRUCT # STRUCT represents the whole dataframe 
in Map Pandas UDF, so no struct column in UDF output. diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index d850403d118..3de2d3de859 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -15,7 +15,6 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_sql_fallback_collect -from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -451,7 +450,6 @@ def test_range_windows_with_string_order_by_column(data_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded_no_part(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -469,7 +467,6 @@ def test_window_batched_unbounded_no_part(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -490,7 +487,6 @@ def test_window_batched_unbounded(b_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_rows_based_running_window_unpartitioned(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -526,7 +522,7 @@ def test_rows_based_running_window_unpartitioned(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Testing multiple batch sizes. 
@pytest.mark.parametrize('a_gen', integral_gens + [string_gen, date_gen, timestamp_gen], ids=meta_idfn('data:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_running_window_without_partitions_runs_batched(a_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -650,7 +646,7 @@ def test_running_window_float_sum_without_partitions_runs_batched(batch_size): @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit, orderable_decimal_gen_128bit], ids=meta_idfn('data:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_running_rank_no_part(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -678,7 +674,7 @@ def test_window_running_rank_no_part(data_gen): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_running_rank(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -706,7 +702,7 @@ def test_window_running_rank(data_gen): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen, c_gen', [(long_gen, x) for x in running_part_and_order_gens] + [(x, long_gen) for x in all_basic_gens + [decimal_gen_32bit]], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.variableFloatAgg.enabled': True, @@ -746,7 +742,7 @@ def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Test different batch sizes. @pytest.mark.parametrize('part_gen', [int_gen, long_gen], ids=idfn) # Partitioning is not really the focus of the test. 
@pytest.mark.parametrize('order_gen', [x for x in all_basic_gens_no_null if x not in boolean_gens] + [decimal_gen_32bit], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_range_running_window_runs_batched(part_gen, order_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -890,7 +886,7 @@ def window(oby_column): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('c_gen', lead_lag_data_gens, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_types_window_aggs_for_rows_lead_lag(a_b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size} data_gen = [ @@ -948,7 +944,7 @@ def do_it(spark): @approximate_float @pytest.mark.parametrize('struct_gen', lead_lag_struct_with_arrays_gen, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_lead_lag_for_structs_with_arrays(a_b_gen, struct_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -982,7 +978,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1012,7 +1008,7 @@ def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): @approximate_float @pytest.mark.parametrize('c_gen', [string_gen], ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_multi_types_window_aggs_for_rows(a_b_gen, c_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -1064,13 +1060,12 @@ def do_it(spark): assert_gpu_and_cpu_are_equal_collect(do_it, conf = {'spark.rapids.sql.batchSizeBytes': '100'}) @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0 is IGNORE NULLS supported for lead and lag by Spark") -@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lead', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame') +@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lead', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('d_gen', all_basic_gens, ids=meta_idfn('agg:')) @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', 
[long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1089,13 +1084,12 @@ def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): ''') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0 is IGNORE NULLS supported for lead and lag by Spark") -@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lag', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame') +@allow_non_gpu('WindowExec', 'Alias', 'WindowExpression', 'Lag', 'Literal', 'WindowSpecDefinition', 'SpecifiedWindowFrame', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('d_gen', all_basic_gens, ids=meta_idfn('agg:')) @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1120,7 +1114,7 @@ def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): @pytest.mark.parametrize('data_gen', [_grpkey_longs_with_timestamps, pytest.param(_grpkey_longs_with_nullable_timestamps)], ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_aggs_for_ranges_timestamps(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, data_gen, length=2048), @@ -1268,7 +1262,7 @@ def test_window_aggregations_for_big_decimal_ranges(data_gen): # SortExec does not support array type, so sort the result locally. @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_aggs_for_rows_collect_list(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1315,7 +1309,7 @@ def test_window_aggs_for_rows_collect_list(): @ignore_order(local=True) # This test is more directed at Databricks and their running window optimization instead of ours # this is why we do not validate that we inserted in a GpuRunningWindowExec, yet. -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_running_window_function_exec_for_all_aggs(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1424,7 +1418,7 @@ def do_it(spark): # SortExec does not support array type, so sort the result locally. 
@ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_aggs_for_rows_collect_set(): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, _gen_data_for_collect_set), @@ -1485,8 +1479,7 @@ def test_window_aggs_for_rows_collect_set(): # See https://github.com/NVIDIA/spark-rapids/issues/3715 # and https://github.com/rapidsai/cudf/issues/11222 @ignore_order(local=True) -@allow_non_gpu("ProjectExec", "SortArray") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu("ProjectExec", "SortArray", *non_utc_allow) def test_window_aggs_for_rows_collect_set_nested_array(): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -1599,7 +1592,7 @@ def do_it(spark): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('ride_along', all_basic_gens + decimal_gens + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_ride_along(ride_along): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, [('a', UniqueLongGen()), ('b', ride_along)]), @@ -1675,7 +1668,7 @@ def test_unbounded_to_unbounded_window(): 'last(a) IGNORE NULLS OVER (PARTITION BY b ORDER BY c) ' @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_window_first_last_nth(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic @@ -1686,7 +1679,6 @@ def test_window_first_last_nth(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='IGNORE NULLS clause is not supported for FIRST(), LAST() and NTH_VALUE in Spark 3.1.x') @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_first_last_nth_ignore_nulls(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic @@ -1697,7 +1689,7 @@ def test_window_first_last_nth_ignore_nulls(data_gen): @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@allow_non_gpu(*non_utc_allow) def test_to_date_with_window_functions(): """ This test ensures that date expressions participating alongside window aggregations diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index 8a0b25a0c95..29be3bc43c0 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -112,6 +112,7 @@ initialize() echo "Build Version : ${BUILDVER}" echo "Skip Dependencies : ${SKIP_DEP_INSTALL}" echo "Include Default Spark Shim : 
${WITH_DEFAULT_UPSTREAM_SHIM}" + echo "Extra environments : ${EXTRA_ENVS}" printf '+ %*s +\n' 100 '' | tr ' ' = } @@ -130,6 +131,10 @@ install_dependencies() ########################## # Main script starts here ########################## +## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...' +if [ -n "$EXTRA_ENVS" ]; then + export ${EXTRA_ENVS//','/' '} +fi initialize if [[ $SKIP_DEP_INSTALL == "1" ]] diff --git a/jenkins/databricks/common_vars.sh b/jenkins/databricks/common_vars.sh index 5f02cbd9439..805eb989c53 100644 --- a/jenkins/databricks/common_vars.sh +++ b/jenkins/databricks/common_vars.sh @@ -15,6 +15,11 @@ # limitations under the License. # +## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...' +if [ -n "$EXTRA_ENVS" ]; then + export ${EXTRA_ENVS//','/' '} +fi + SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)} export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db} diff --git a/jenkins/databricks/params.py b/jenkins/databricks/params.py index 22a36fdf7c8..dce2436a6e6 100644 --- a/jenkins/databricks/params.py +++ b/jenkins/databricks/params.py @@ -26,11 +26,13 @@ base_spark_pom_version = '3.2.1' base_spark_version_to_install_databricks_jars = base_spark_pom_version clusterid = '' -# can take comma seperated maven options, e.g., -Pfoo=1,-Dbar=2,... +# can take comma separated maven options, e.g., -Pfoo=1,-Dbar=2,... mvn_opt = '' jar_path = '' -# `spark_conf` can take comma seperated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...' +# can take comma separated multiple spark configurations, e.g., spark.foo=1,spark.bar=2,...' spark_conf = '' +# can take comma separated environments, e.g., foo=abc,bar=123,...' +extra_envs = '' def usage(): @@ -48,11 +50,12 @@ def usage(): ' -j ' ' -n ' ' -f ' - ' -i ') + ' -i ' + ' -e ') try: - opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:', + opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:', ['workspace=', 'token=', 'clusterid=', @@ -62,9 +65,10 @@ def usage(): 'sparktgz=', 'basesparkpomversion=', 'mvnoptions=', - 'jarpath', - 'sparkconf', - 'sparkinstallver=']) + 'jarpath=', + 'sparkconf=', + 'sparkinstallver=', + 'extraenvs=']) except getopt.GetoptError: usage() sys.exit(2) @@ -97,6 +101,8 @@ def usage(): spark_conf = arg elif opt in ('-i', '--sparkinstallver'): base_spark_version_to_install_databricks_jars = arg + elif opt in ('-e', '--extraenvs'): + extra_envs = arg print('-w is ' + workspace) print('-c is ' + clusterid) @@ -109,3 +115,4 @@ def usage(): print('-j is ' + jar_path) print('-f is ' + spark_conf) print('-i is ' + base_spark_version_to_install_databricks_jars) +print('-e is ' + extra_envs) diff --git a/jenkins/databricks/run-build.py b/jenkins/databricks/run-build.py index 38c349237aa..277c4f7024c 100644 --- a/jenkins/databricks/run-build.py +++ b/jenkins/databricks/run-build.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
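(Aside on the params.py hunk above: in Python's getopt, a long option only accepts a value when its name ends with '=', so the earlier 'jarpath'/'sparkconf' entries could not take one; the corrected list and the new -e/--extraenvs flag are exercised in the small sketch below. The argument values are placeholders, not taken from this patch.)

```python
import getopt

# Hypothetical command line; the values are placeholders.
argv = ['-e', 'foo=abc,bar=123', '--sparkconf', 'spark.foo=1']

# Long options must end with '=' to accept a value, mirroring the corrected list above.
opts, _ = getopt.getopt(argv, 'e:f:', ['extraenvs=', 'sparkconf='])

extra_envs = ''
spark_conf = ''
for opt, arg in opts:
    if opt in ('-e', '--extraenvs'):
        extra_envs = arg
    elif opt in ('-f', '--sparkconf'):
        spark_conf = arg

print(extra_envs)  # foo=abc,bar=123
print(spark_conf)  # spark.foo=1
```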
@@ -29,24 +29,25 @@ def main(): print("Master node address is: %s" % master_addr) print("Copying script") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest) + ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file + rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.local_script, master_addr, params.script_dest) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True) print("Copying source") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" %s ubuntu@%s:%s" % (params.private_key_file, params.source_tgz, master_addr, params.tgz_dest) + rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % (ssh_args, params.source_tgz, master_addr, params.tgz_dest) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True) - ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \ - "'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s \ + ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \ + "'SPARKSRCTGZ=%s BASE_SPARK_VERSION=%s BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=%s MVN_OPT=%s EXTRA_ENVS=%s \ bash %s %s 2>&1 | tee buildout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % \ - (master_addr, params.private_key_file, params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.script_dest, ' '.join(params.script_args)) + (params.tgz_dest, params.base_spark_pom_version, params.base_spark_version_to_install_databricks_jars, params.mvn_opt, params.extra_envs, params.script_dest, ' '.join(params.script_args)) print("ssh command: %s" % ssh_command) subprocess.check_call(ssh_command, shell = True) print("Copying built tarball back") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (params.private_key_file, master_addr) + rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr) print("rsync command to get built tarball: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True) diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py index 19710f9bb28..cd0f8f0e04c 100644 --- a/jenkins/databricks/run-tests.py +++ b/jenkins/databricks/run-tests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
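(The main() hunk above, and the matching one in run-tests.py below, build the shared ssh options once as `ssh_args` and prefix the new `EXTRA_ENVS` value onto the remote command's environment. A rough standalone sketch of that pattern follows; the host, key path, and remote script are placeholders, so the actual subprocess call is left commented out.)

```python
private_key_file = '~/.ssh/id_rsa'   # placeholder
master_addr = '203.0.113.10'         # placeholder
extra_envs = 'foo=abc,bar=123'       # same comma-separated form as --extraenvs

# Shared ssh options, built once and reused by both rsync and ssh.
ssh_args = ('-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
            '-p 2200 -i %s' % private_key_file)

rsync_command = 'rsync -I -Pave "ssh %s" build.sh ubuntu@%s:/home/ubuntu/' % (ssh_args, master_addr)
ssh_command = ("ssh %s ubuntu@%s 'EXTRA_ENVS=%s bash /home/ubuntu/build.sh'"
               % (ssh_args, master_addr, extra_envs))

print(rsync_command)
print(ssh_command)
# subprocess.check_call(ssh_command, shell=True)  # would try to reach the placeholder host
```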
@@ -30,15 +30,16 @@ def main(): print("Master node address is: %s" % master_addr) print("Copying script") - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \ - " %s ubuntu@%s:%s" % (params.private_key_file, params.local_script, master_addr, params.script_dest) + ssh_args = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s" % params.private_key_file + rsync_command = "rsync -I -Pave \"ssh %s\" %s ubuntu@%s:%s" % \ + (ssh_args, params.local_script, master_addr, params.script_dest) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell=True) - ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s " \ - "'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s bash %s %s 2>&1 | tee testout; " \ + ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \ + "'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \ "if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \ - (master_addr, params.private_key_file, params.jar_path, params.spark_conf, params.base_spark_pom_version, + (params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs, params.script_dest, ' '.join(params.script_args)) print("ssh command: %s" % ssh_command) try: @@ -46,9 +47,8 @@ def main(): finally: print("Copying test report tarball back") report_path_prefix = params.jar_path if params.jar_path else "/home/ubuntu/spark-rapids" - rsync_command = "rsync -I -Pave \"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2200 -i %s\"" \ - " ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \ - (params.private_key_file, master_addr, report_path_prefix) + rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:%s/integration_tests/target/run_dir*/TEST-pytest-*.xml ./" % \ + (ssh_args, master_addr, report_path_prefix) print("rsync command: %s" % rsync_command) subprocess.check_call(rsync_command, shell = True) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCSVScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCSVScan.scala index 7f078cb4db2..611e9ce43a1 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCSVScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCSVScan.scala @@ -166,12 +166,29 @@ object GpuCSVScan { if (types.contains(DateType)) { GpuTextBasedDateUtils.tagCudfFormat(meta, GpuCsvUtils.dateFormatInRead(parsedOptions), parseString = true) + + // For date type, timezone needs to be checked also. This is because JVM timezone is used + // to get days offset before rebasing Julian to Gregorian in Spark while not in Rapids. + // + // In details, for CSV data format, Spark uses dateFormatter to parse string as date data + // type which utilizes [[org.apache.spark.sql.catalyst.DateFormatter]]. And CSV format + // (e.g., [[UnivocityParser]]), it uses [[LegacyFastDateFormatter]] which is based on + // Apache Commons FastDateFormat. It parse string into Java util.Date base on JVM default + // timezone. From Java util.Date, it's converted into java.sql.Date type. + // By leveraging [[JavaDateTimeUtils]], it finally do `rebaseJulianToGregorianDays` + // considering its offset to UTC timezone. 
+ if (!GpuOverrides.isUTCTimezone(parsedOptions.zoneId)) { + meta.willNotWorkOnGpu(s"Not supported timezone type ${parsedOptions.zoneId}.") + } } if (types.contains(TimestampType)) { - meta.checkTimeZoneId(parsedOptions.zoneId) GpuTextBasedDateUtils.tagCudfFormat(meta, GpuCsvUtils.timestampFormatInRead(parsedOptions), parseString = true) + + if (!GpuOverrides.isUTCTimezone(parsedOptions.zoneId)) { + meta.willNotWorkOnGpu(s"Not supported timezone type ${parsedOptions.zoneId}.") + } } // TODO parsedOptions.emptyValueInRead diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 8ef40af5689..71392cd008d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -177,9 +177,6 @@ abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpre def buildTagMessage(entry: ConfEntry[_]): String = { s"${entry.doc}. To enable this operation on the GPU, set ${entry.key} to true." } - - // timezone tagging in type checks is good enough, so always false - override protected val needTimezoneTagging: Boolean = false } object CastOptions { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala index abbda8e303b..d3e82a0637b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala @@ -21,6 +21,7 @@ import java.net.URI import java.nio.ByteBuffer import java.nio.channels.Channels import java.nio.charset.StandardCharsets +import java.time.ZoneId import java.util import java.util.concurrent.{Callable, TimeUnit} import java.util.regex.Pattern @@ -154,6 +155,19 @@ object GpuOrcScan { meta.willNotWorkOnGpu("GpuOrcScan does not support default values in schema") } + // For date type, timezone needs to be checked also. This is because JVM timezone and UTC + // timezone offset is considered when getting [[java.sql.date]] from + // [[org.apache.spark.sql.execution.datasources.DaysWritable]] object + // which is a subclass of [[org.apache.hadoop.hive.serde2.io.DateWritable]]. + val types = schema.map(_.dataType).toSet + if (types.exists(GpuOverrides.isOrContainsDateOrTimestamp(_))) { + if (!GpuOverrides.isUTCTimezone()) { + meta.willNotWorkOnGpu("Only UTC timezone is supported for ORC. " + + s"Current timezone settings: (JVM : ${ZoneId.systemDefault()}, " + + s"session: ${SQLConf.get.sessionLocalTimeZone}). 
") + } + } + FileFormatChecks.tag(meta, schema, OrcFormatType, ReadFileOp) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 4d45dacfd0d..2a77fdbc06c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.optimizer.NormalizeNaNAndZero import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreeNodeTag -import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} import org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, QueryStageExec, ShuffleQueryStageExec} @@ -319,7 +319,7 @@ final class InsertIntoHadoopFsRelationCommandMeta( private var fileFormat: Option[ColumnarFileFormat] = None - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { if (cmd.bucketSpec.isDefined) { willNotWorkOnGpu("bucketing is not supported") } @@ -625,6 +625,11 @@ object GpuOverrides extends Logging { timezoneId.normalized() == UTC_TIMEZONE_ID } + def isUTCTimezone(): Boolean = { + val zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone) + isUTCTimezone(zoneId.normalized()) + } + def areAllSupportedTypes(types: DataType*): Boolean = types.forall(isSupportedType(_)) /** @@ -673,9 +678,7 @@ object GpuOverrides extends Logging { case FloatType => true case DoubleType => true case DateType => true - case TimestampType => - TypeChecks.areTimestampsSupported(ZoneId.systemDefault()) && - TypeChecks.areTimestampsSupported(SQLConf.get.sessionLocalTimeZone) + case TimestampType => true case StringType => true case dt: DecimalType if allowDecimal => dt.precision <= DType.DECIMAL64_MAX_PRECISION case NullType => allowNull @@ -700,6 +703,12 @@ object GpuOverrides extends Logging { def isOrContainsFloatingPoint(dataType: DataType): Boolean = TrampolineUtil.dataTypeExistsRecursively(dataType, dt => dt == FloatType || dt == DoubleType) + def isOrContainsDateOrTimestamp(dataType: DataType): Boolean = + TrampolineUtil.dataTypeExistsRecursively(dataType, dt => dt == TimestampType || dt == DateType) + + def isOrContainsTimestamp(dataType: DataType): Boolean = + TrampolineUtil.dataTypeExistsRecursively(dataType, dt => dt == TimestampType) + /** Tries to predict whether an adaptive plan will end up with data on the GPU or not. 
*/ def probablyGpuPlan(adaptivePlan: AdaptiveSparkPlanExec, conf: RapidsConf): Boolean = { def findRootProcessingNode(plan: SparkPlan): SparkPlan = plan match { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala index 7e845491ec0..e8ae977b1f6 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala @@ -129,7 +129,7 @@ object GpuParquetFileFormat { SparkShimImpl.parquetRebaseWrite(sqlConf)) if ((int96RebaseMode == DateTimeRebaseLegacy || dateTimeRebaseMode == DateTimeRebaseLegacy) - && !TypeChecks.areTimestampsSupported()) { + && !GpuOverrides.isUTCTimezone()) { meta.willNotWorkOnGpu("Only UTC timezone is supported in LEGACY rebase mode. " + s"Current timezone settings: (JVM : ${ZoneId.systemDefault()}, " + s"session: ${SQLConf.get.sessionLocalTimeZone}). " + diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PlanShims.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PlanShims.scala index cfb1586d6e3..dcab3e53f20 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PlanShims.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/PlanShims.scala @@ -18,11 +18,19 @@ package com.nvidia.spark.rapids import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.types.DataType trait PlanShims { def extractExecutedPlan(plan: SparkPlan): SparkPlan def isAnsiCast(e: Expression): Boolean def isAnsiCastOptionallyAliased(e: Expression): Boolean + + /** + * Extra Ansi Cast's source's data type and target's data type + * @param e should be AnsiCast type or Cast under Ansi mode + * @return (source data type, target data type) + */ + def extractAnsiCastTypes(e: Expression): (DataType, DataType) } object PlanShims { @@ -35,7 +43,12 @@ object PlanShims { def isAnsiCast(e: Expression): Boolean = { shims.isAnsiCast(e) } + def isAnsiCastOptionallyAliased(e: Expression): Boolean = { shims.isAnsiCastOptionallyAliased(e) } + + def extractAnsiCastTypes(e: Expression): (DataType, DataType) = { + shims.extractAnsiCastTypes(e) + } } \ No newline at end of file diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index 33ad80f48de..7e3303423fa 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -443,13 +443,11 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { case Some(value) => ZoneId.of(value) case None => throw new RuntimeException(s"Driver time zone cannot be determined.") } - if (TypeChecks.areTimestampsSupported(driverTimezone)) { - val executorTimezone = ZoneId.systemDefault() - if (executorTimezone.normalized() != driverTimezone.normalized()) { - throw new RuntimeException(s" Driver and executor timezone mismatch. " + - s"Driver timezone is $driverTimezone and executor timezone is " + - s"$executorTimezone. Set executor timezone to $driverTimezone.") - } + val executorTimezone = ZoneId.systemDefault() + if (executorTimezone.normalized() != driverTimezone.normalized()) { + throw new RuntimeException(s" Driver and executor timezone mismatch. " + + s"Driver timezone is $driverTimezone and executor timezone is " + + s"$executorTimezone. 
Set executor timezone to $driverTimezone.") } GpuCoreDumpHandler.executorInit(conf, pluginContext) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala index 66803977cdb..e5cf75e9a13 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import com.nvidia.spark.rapids.shims.{DistributionUtil, SparkShimImpl} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, ComplexTypeMergingExpression, Expression, QuaternaryExpression, String2TrimExpression, TernaryExpression, TimeZoneAwareExpression, UnaryExpression, WindowExpression, WindowFunction} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, Cast, ComplexTypeMergingExpression, Expression, QuaternaryExpression, String2TrimExpression, TernaryExpression, TimeZoneAwareExpression, UnaryExpression, UTCTimestamp, WindowExpression, WindowFunction} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.trees.TreeNodeTag @@ -34,9 +34,11 @@ import org.apache.spark.sql.execution.command.{DataWritingCommand, RunnableComma import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec} import org.apache.spark.sql.execution.python.AggregateInPandasExec +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.rapids.TimeZoneDB import org.apache.spark.sql.rapids.aggregate.{CpuToGpuAggregateBufferConverter, GpuToCpuAggregateBufferConverter} import org.apache.spark.sql.rapids.execution.{GpuBroadcastHashJoinMetaBase, GpuBroadcastNestedLoopJoinMetaBase} -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{ArrayType, DataType, DateType, MapType, StringType, StructType} trait DataFromReplacementRule { val operationName: String @@ -382,14 +384,14 @@ abstract class RapidsMeta[INPUT <: BASE, BASE, OUTPUT <: BASE]( def checkTimeZoneId(sessionZoneId: ZoneId): Unit = { // Both of the Spark session time zone and JVM's default time zone should be UTC. - if (!TypeChecks.areTimestampsSupported(sessionZoneId)) { - willNotWorkOnGpu("Only UTC zone id is supported. " + + if (!TimeZoneDB.isSupportedTimezone(sessionZoneId)) { + willNotWorkOnGpu("Not supported zone id. " + s"Actual session local zone id: $sessionZoneId") } val defaultZoneId = ZoneId.systemDefault() - if (!TypeChecks.areTimestampsSupported(defaultZoneId)) { - willNotWorkOnGpu(s"Only UTC zone id is supported. Actual default zone id: $defaultZoneId") + if (!TimeZoneDB.isSupportedTimezone(defaultZoneId)) { + willNotWorkOnGpu(s"Not supported zone id. 
Actual default zone id: $defaultZoneId") } } @@ -544,7 +546,32 @@ abstract class DataWritingCommandMeta[INPUT <: DataWritingCommand]( override val childParts: Seq[PartMeta[_]] = Seq.empty override val childDataWriteCmds: Seq[DataWritingCommandMeta[_]] = Seq.empty - override def tagSelfForGpu(): Unit = {} + val checkTimeZone: Boolean = true + + final override def tagSelfForGpu(): Unit = { + if (checkTimeZone) { + timezoneCheck() + } + tagSelfForGpuInternal() + } + + protected def tagSelfForGpuInternal(): Unit = {} + + // Check whether the data types of input/output contain timestamp type, which + // is related to time zone. + // Only the UTC time zone is allowed, to be consistent with previous behavior + // for [[DataWritingCommand]]. Override [[checkTimeZone]] to skip the + // UTC time zone check in a subclass of [[DataWritingCommand]]. + def timezoneCheck(): Unit = { + val types = (wrapped.inputSet.map(_.dataType) ++ wrapped.outputSet.map(_.dataType)).toSet + if (types.exists(GpuOverrides.isOrContainsTimestamp(_))) { + if (!GpuOverrides.isUTCTimezone()) { + willNotWorkOnGpu("Only UTC timezone is supported. " + + s"Current timezone settings: (JVM : ${ZoneId.systemDefault()}, " + + s"session: ${SQLConf.get.sessionLocalTimeZone}). ") + } + } + } } /** @@ -556,7 +583,7 @@ final class RuleNotFoundDataWritingCommandMeta[INPUT <: DataWritingCommand]( cmd: INPUT, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]]) extends DataWritingCommandMeta[INPUT](cmd, conf, parent, new NoRuleDataFromReplacementRule) { - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { willNotWorkOnGpu(s"GPU does not currently support the operator ${cmd.getClass}") } @@ -1082,21 +1109,110 @@ abstract class BaseExprMeta[INPUT <: Expression]( val isFoldableNonLitAllowed: Boolean = false + // There are 4 levels of timezone checks in the GPU plan tagging phase: + // Level 1: Check whether an expression is related to timezone. This is achieved by + // [[needTimeZoneCheck]] below. + // Level 2: Check the golden configuration 'spark.rapids.sql.nonUTC.enabled'. If + // enabled, move on to the next level of timezone checks. If not, only the UTC case passes, as before. + // Level 3: Check whether the related expression has been implemented with timezone support. There is + // a toggle flag [[isTimeZoneSupported]] for this. If false, fall back to the UTC-only check as + // before. If true, move to the next level of checks. When we add timezone support for a related + // function, [[isTimeZoneSupported]] should be overridden to true. + // Level 4: Check whether the desired timezone is supported by the GPU kernel. 
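(To see the four levels above from the integration-test side, a hypothetical pytest, not part of this patch, could set a non-DST session timezone together with the 'spark.rapids.sql.nonUTC.enabled' toggle quoted in the Level 2 comment and rely on the usual CPU/GPU comparison. The harness names come from this repo's test utilities; whether a given expression actually stays on the GPU depends on Levels 3 and 4, so a real test would likely also need @allow_non_gpu for operators that still fall back.)

```python
from asserts import assert_gpu_and_cpu_are_equal_collect
from data_gen import *

# Hypothetical sketch only: a session timezone without DST rules (Level 4) plus the
# Level 2 toggle quoted in the comment above.
_non_utc_conf = {
    'spark.sql.session.timeZone': 'Asia/Shanghai',
    'spark.rapids.sql.nonUTC.enabled': 'true',
}

def test_timestamp_expr_in_non_utc_zone():
    # Compares CPU and GPU results; expressions whose metas do not yet override
    # isTimeZoneSupported would still fall back and would need @allow_non_gpu.
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, timestamp_gen).selectExpr('hour(a)'),
        conf=_non_utc_conf)
```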
+ def checkExprForTimezone(): Unit = { + // Level 1 check + if (!needTimeZoneCheck) return + + // Level 2 check + if (!conf.nonUTCTimeZoneEnabled) return checkUTCTimezone(this) + + // Level 3 check + if (!isTimeZoneSupported) return checkUTCTimezone(this) + + // Level 4 check + if (!TimeZoneDB.isSupportedTimezone(getZoneId())) { + willNotWorkOnGpu(TimeZoneDB.timezoneNotSupportedStr(getZoneId().toString)) + } + } + + protected def getZoneId(): ZoneId = { + this.wrapped match { + case tzExpr: TimeZoneAwareExpression => tzExpr.zoneId + case ts: UTCTimestamp => { + assert(false, s"Have to override getZoneId() of BaseExprMeta in ${this.getClass.toString}") + throw new IllegalArgumentException(s"Failed to get zone id from ${ts.getClass.toString}") + } + case _ => throw new IllegalArgumentException( + s"Zone check should never happen for ${this.getClass.toString} " + + "which is not timezone related") + } + } + + // Level 1 timezone checking flag + // Both [[isTimeZoneSupported]] and [[needTimeZoneCheck]] are needed to decide whether a timezone + // check is needed. For a cast expression, only some cases need it, depending on its data type and + // its child's data type. + // + //+------------------------+-------------------+-----------------------------------------+ + //| Value | needTimeZoneCheck | isTimeZoneSupported | + //+------------------------+-------------------+-----------------------------------------+ + //| TimezoneAwareExpression| True | False by default, True when implemented | + //| UTCTimestamp | True | False by default, True when implemented | + //| Others | False | N/A (will not be checked) | + //+------------------------+-------------------+-----------------------------------------+ + lazy val needTimeZoneCheck: Boolean = { + wrapped match { + // CurrentDate expression will not go through this even though it's a `TimeZoneAwareExpression`. + // It will be treated as a literal in Rapids. + case _: TimeZoneAwareExpression => + if (wrapped.isInstanceOf[Cast]) { + val cast = wrapped.asInstanceOf[Cast] + needsTimeZone(cast.child.dataType, cast.dataType) + } else if (PlanShims.isAnsiCast(wrapped)) { + val (from, to) = PlanShims.extractAnsiCastTypes(wrapped) + needsTimeZone(from, to) + } else { + true + } + case _: UTCTimestamp => true + case _ => false + } + } + + // Mostly based on Spark's existing [[Cast.needsTimeZone]] method. Two changes are made: + // 1. Override the date-related cases based on https://github.com/apache/spark/pull/40524 (merged) + // 2. The existing `needsTimeZone` doesn't consider complex-type-to-string casts, which are timezone + // related (incl. struct/map/list to string). + private[this] def needsTimeZone(from: DataType, to: DataType): Boolean = (from, to) match { + case (StringType, DateType) => false + case (DateType, StringType) => false + case (ArrayType(fromType, _), StringType) => needsTimeZone(fromType, to) + case (MapType(fromKey, fromValue, _), StringType) => + needsTimeZone(fromKey, to) || needsTimeZone(fromValue, to) + case (StructType(fromFields), StringType) => + fromFields.exists { + case fromField => + needsTimeZone(fromField.dataType, to) + } + // Avoid copying the full implementation here. 
Otherwise needs to create shim for TimestampNTZ + // since Spark 3.4.0 + case _ => Cast.needsTimeZone(from, to) + } + + // Level 3 timezone checking flag, need to override to true when supports timezone in functions + // Useless if it's not timezone related expression defined in [[needTimeZoneCheck]] + val isTimeZoneSupported: Boolean = false + /** - * Whether to tag a TimeZoneAwareExpression for timezone after all the other tagging - * is done. - * By default a TimeZoneAwareExpression always requires the timezone tagging, but - * there are some exceptions, e.g. 'Cast', who requires timezone tagging only when it - * has timezone sensitive type as input or output. + * Timezone check which only allows UTC timezone. This is consistent with previous behavior. * - * Override this to match special cases. + * @param meta to check whether it's UTC */ - protected def needTimezoneTagging: Boolean = { - // A TimeZoneAwareExpression with no timezone sensitive types as input/output will - // escape from the timezone tagging in the prior type checks. So ask for tagging here. - // e.g. 'UnixTimestamp' with 'DateType' as the input, timezone will be taken into - // account when converting a Date to a Long. - !(dataType +: childExprs.map(_.dataType)).exists(TypeChecks.isTimezoneSensitiveType) + def checkUTCTimezone(meta: RapidsMeta[_, _, _]): Unit = { + if (!GpuOverrides.isUTCTimezone()) { + meta.willNotWorkOnGpu( + TimeZoneDB.nonUTCTimezoneNotSupportedStr(meta.wrapped.getClass.toString)) + } } final override def tagSelfForGpu(): Unit = { @@ -1105,12 +1221,8 @@ abstract class BaseExprMeta[INPUT <: Expression]( s"$wrapped is foldable and operates on non literals") } rule.getChecks.foreach(_.tag(this)) + checkExprForTimezone() tagExprForGpu() - wrapped match { - case tzAware: TimeZoneAwareExpression if needTimezoneTagging => - checkTimeZoneId(tzAware.zoneId) - case _ => // do nothing - } } /** diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala index 517b15d9912..b8dea05b73f 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala @@ -17,14 +17,11 @@ package com.nvidia.spark.rapids import java.io.{File, FileOutputStream} -import java.time.ZoneId import ai.rapids.cudf.DType import com.nvidia.spark.rapids.shims.{CastCheckShims, GpuTypeShims, TypeSigUtil} import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, UnaryExpression, WindowSpecDefinition} -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** Trait of TypeSigUtil for different spark versions */ @@ -363,8 +360,7 @@ final class TypeSig private( case FloatType => check.contains(TypeEnum.FLOAT) case DoubleType => check.contains(TypeEnum.DOUBLE) case DateType => check.contains(TypeEnum.DATE) - case TimestampType if check.contains(TypeEnum.TIMESTAMP) => - TypeChecks.areTimestampsSupported() + case TimestampType => check.contains(TypeEnum.TIMESTAMP) case StringType => check.contains(TypeEnum.STRING) case dt: DecimalType => check.contains(TypeEnum.DECIMAL) && @@ -402,15 +398,6 @@ final class TypeSig private( } } - private[this] def timezoneNotSupportedMessage(dataType: DataType, - te: TypeEnum.Value, check: TypeEnum.ValueSet, isChild: Boolean): Seq[String] = { - if (check.contains(te) && !TypeChecks.areTimestampsSupported()) { - Seq(withChild(isChild, 
TypeChecks.timezoneNotSupportedString(dataType))) - } else { - basicNotSupportedMessage(dataType, te, check, isChild) - } - } - private[this] def reasonNotSupported( check: TypeEnum.ValueSet, dataType: DataType, @@ -433,7 +420,7 @@ final class TypeSig private( case DateType => basicNotSupportedMessage(dataType, TypeEnum.DATE, check, isChild) case TimestampType => - timezoneNotSupportedMessage(dataType, TypeEnum.TIMESTAMP, check, isChild) + basicNotSupportedMessage(dataType, TypeEnum.TIMESTAMP, check, isChild) case StringType => basicNotSupportedMessage(dataType, TypeEnum.STRING, check, isChild) case dt: DecimalType => @@ -780,30 +767,6 @@ abstract class TypeChecks[RET] { }.mkString(", ") } - /** - * Original log does not print enough info when timezone is not UTC, - * here check again to add UTC info. - */ - private def tagTimezoneInfoIfHasTimestampType( - unsupportedTypes: Map[DataType, Set[String]], - meta: RapidsMeta[_, _, _]): Unit = { - def checkTimestampType(dataType: DataType): Unit = dataType match { - case TimestampType if !TypeChecks.areTimestampsSupported() => - meta.willNotWorkOnGpu(TypeChecks.timezoneNotSupportedString(dataType)) - case ArrayType(elementType, _) => - checkTimestampType(elementType) - case MapType(keyType, valueType, _) => - checkTimestampType(keyType) - checkTimestampType(valueType) - case StructType(fields) => - fields.foreach(field => checkTimestampType(field.dataType)) - case _ => // do nothing - } - unsupportedTypes.foreach { case (dataType, _) => - checkTimestampType(dataType) - } - } - protected def tagUnsupportedTypes( meta: RapidsMeta[_, _, _], sig: TypeSig, @@ -815,43 +778,12 @@ abstract class TypeChecks[RET] { .groupBy(_.dataType) .mapValues(_.map(_.name).toSet).toMap - tagTimezoneInfoIfHasTimestampType(unsupportedTypes, meta) - if (unsupportedTypes.nonEmpty) { meta.willNotWorkOnGpu(msgFormat.format(stringifyTypeAttributeMap(unsupportedTypes))) } } } -object TypeChecks { - /** - * Check if the time zone passed is supported by plugin. - */ - def areTimestampsSupported(timezoneId: ZoneId): Boolean = { - timezoneId.normalized() == GpuOverrides.UTC_TIMEZONE_ID - } - - def areTimestampsSupported(zoneIdString: String): Boolean = { - val zoneId = DateTimeUtils.getZoneId(zoneIdString) - areTimestampsSupported(zoneId) - } - - def areTimestampsSupported(): Boolean = { - areTimestampsSupported(ZoneId.systemDefault()) && - areTimestampsSupported(SQLConf.get.sessionLocalTimeZone) - } - - def isTimezoneSensitiveType(dataType: DataType): Boolean = { - dataType == TimestampType - } - - def timezoneNotSupportedString(dataType: DataType): String = { - s"$dataType is not supported with timezone settings: (JVM:" + - s" ${ZoneId.systemDefault()}, session: ${SQLConf.get.sessionLocalTimeZone})." 
+ - s" Set both of the timezones to UTC to enable $dataType support" - } -} - /** * Checks a set of named inputs to an SparkPlan node against a TypeSig */ diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala index c4840839616..04f28ef045d 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala @@ -166,10 +166,26 @@ object GpuJsonScan { if (types.contains(DateType)) { GpuTextBasedDateUtils.tagCudfFormat(meta, GpuJsonUtils.dateFormatInRead(parsedOptions), parseString = true) + + // For date type, timezone needs to be checked also. This is because JVM timezone is used + // to get days offset before rebasing Julian to Gregorian in Spark while not in Rapids. + // + // In details, for Json data format, Spark uses dateFormatter to parse string as date data + // type which utilizes [[org.apache.spark.sql.catalyst.DateFormatter]]. For Json format, it + // uses [[LegacyFastDateFormatter]] which is based on Apache Commons FastDateFormat. It parse + // string into Java util.Date base on JVM default timezone. From Java util.Date, it's + // converted into java.sql.Date type. By leveraging [[JavaDateTimeUtils]], it finally do + // `rebaseJulianToGregorianDays` considering its offset to UTC timezone. + if(!GpuOverrides.isUTCTimezone(parsedOptions.zoneId)){ + meta.willNotWorkOnGpu(s"Not supported timezone type ${parsedOptions.zoneId}.") + } } - if (types.contains(TimestampType)) { - meta.checkTimeZoneId(parsedOptions.zoneId) + if (types.contains(TimestampType) || types.contains(DateType)) { + if (!GpuOverrides.isUTCTimezone(parsedOptions.zoneId)) { + meta.willNotWorkOnGpu(s"Not supported timezone type ${parsedOptions.zoneId}.") + } + GpuTextBasedDateUtils.tagCudfFormat(meta, GpuJsonUtils.timestampFormatInRead(parsedOptions), parseString = true) } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala index 476c3f75f3b..1eabc9b5cea 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.rapids import java.nio.charset.Charset +import java.time.ZoneId import com.google.common.base.Charsets import com.nvidia.spark.RapidsUDF @@ -31,6 +32,7 @@ import org.apache.spark.sql.hive.{HiveGenericUDF, HiveSimpleUDF} import org.apache.spark.sql.hive.execution.HiveTableScanExec import org.apache.spark.sql.hive.rapids.GpuHiveTextFileUtils._ import org.apache.spark.sql.hive.rapids.shims.HiveProviderCmdShims +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.execution.TrampolineUtil import org.apache.spark.sql.types._ @@ -289,6 +291,15 @@ class HiveProviderImpl extends HiveProviderCmdShims { willNotWorkOnGpu("reading of decimal typed values has been disabled set " + s"${RapidsConf.ENABLE_READ_HIVE_DECIMALS} to true to enable this.") } + + val types = wrapped.schema.map(_.dataType).toSet + if (types.exists(GpuOverrides.isOrContainsTimestamp(_))) { + if (!GpuOverrides.isUTCTimezone()) { + willNotWorkOnGpu("Only UTC timezone is supported. 
" + + s"Current timezone settings: (JVM : ${ZoneId.systemDefault()}, " + + s"session: ${SQLConf.get.sessionLocalTimeZone}). ") + } + } } override def convertToGpu(): GpuExec = { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala index 9e50a993537..d2f4380646c 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala @@ -16,6 +16,8 @@ package org.apache.spark.sql.rapids +import java.time.ZoneId + import ai.rapids.cudf._ import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.shims.OrcShims @@ -30,6 +32,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.orc.{OrcFileFormat, OrcOptions, OrcUtils} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ object GpuOrcFileFormat extends Logging { @@ -75,6 +78,19 @@ object GpuOrcFileFormat extends Logging { "If bloom filter is not required, unset \"orc.bloom.filter.columns\"") } + // For date type, timezone needs to be checked also. This is because JVM timezone and UTC + // timezone offset is considered when getting [[java.sql.date]] from + // [[org.apache.spark.sql.execution.datasources.DaysWritable]] object + // which is a subclass of [[org.apache.hadoop.hive.serde2.io.DateWritable]]. + val types = schema.map(_.dataType).toSet + if (types.exists(GpuOverrides.isOrContainsDateOrTimestamp(_))) { + if (!GpuOverrides.isUTCTimezone()) { + meta.willNotWorkOnGpu("Only UTC timezone is supported for ORC. " + + s"Current timezone settings: (JVM : ${ZoneId.systemDefault()}, " + + s"session: ${SQLConf.get.sessionLocalTimeZone}). ") + } + } + FileFormatChecks.tag(meta, schema, OrcFormatType, WriteFileOp) val sqlConf = spark.sessionState.conf diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/TimeZoneDB.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/TimeZoneDB.scala index 91c1928cc00..9202767cacc 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/TimeZoneDB.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/TimeZoneDB.scala @@ -22,8 +22,10 @@ import ai.rapids.cudf.{ColumnVector, DType, HostColumnVector} import com.nvidia.spark.rapids.Arm.withResource import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.SQLConf object TimeZoneDB { + // Copied from Spark. Used to format time zone ID string with (+|-)h:mm and (+|-)hh:m def getZoneId(timezoneId: String): ZoneId = { val formattedZoneId = timezoneId @@ -40,6 +42,22 @@ object TimeZoneDB { rules.isFixedOffset || rules.getTransitionRules.isEmpty } + def isSupportedTimezone(timezoneId: ZoneId): Boolean = { + val rules = timezoneId.getRules + rules.isFixedOffset || rules.getTransitionRules.isEmpty + } + + def nonUTCTimezoneNotSupportedStr(exprName: String): String = { + s"$exprName is not supported with timezone settings: (JVM:" + + s" ${ZoneId.systemDefault()}, session: ${SQLConf.get.sessionLocalTimeZone})." + + s" Set both of the timezones to UTC to enable $exprName support" + } + + def timezoneNotSupportedStr(timezoneIdStr: String): String = { + s"Timezone $timezoneIdStr is not supported yet. Only Non DST (daylight saving time) timezone" + + s" is supported." 
+ } + def cacheDatabase(): Unit = {} /** diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala index d879b6da1e9..e67425d2591 100644 --- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala @@ -37,6 +37,7 @@ package com.nvidia.spark.rapids.shims import com.nvidia.spark.rapids.{GpuCast, GpuEvalMode} import org.apache.spark.sql.catalyst.expressions.{AnsiCast, Cast, Expression} +import org.apache.spark.sql.types.DataType object AnsiCastShim { def isAnsiCast(e: Expression): Boolean = e match { @@ -59,4 +60,9 @@ object AnsiCastShim { m.setAccessible(true) m.getBoolean(e) } + + def extractAnsiCastTypes(e: Expression): (DataType, DataType) = e match { + case c: AnsiCast => (c.child.dataType, c.dataType) + case _ => throw new UnsupportedOperationException(s"${e.getClass} is not AnsiCast type") + } } \ No newline at end of file diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectCommandMetaShims.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectCommandMetaShims.scala index 6d515d4845d..42fe88916c6 100644 --- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectCommandMetaShims.scala +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectCommandMetaShims.scala @@ -55,7 +55,7 @@ final class CreateDataSourceTableAsSelectCommandMeta( private var origProvider: Class[_] = _ private var gpuProvider: Option[ColumnarFileFormat] = None - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { if (cmd.table.bucketSpec.isDefined) { willNotWorkOnGpu("bucketing is not supported") } diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/GpuOptimizedCreateHiveTableAsSelectCommandShims.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/GpuOptimizedCreateHiveTableAsSelectCommandShims.scala index afcec8d5706..869b4a2b3b8 100644 --- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/GpuOptimizedCreateHiveTableAsSelectCommandShims.scala +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/GpuOptimizedCreateHiveTableAsSelectCommandShims.scala @@ -171,7 +171,7 @@ final class OptimizedCreateHiveTableAsSelectCommandMeta( extends DataWritingCommandMeta[OptimizedCreateHiveTableAsSelectCommand]( cmd, conf, parent, rule) { - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { // It would be cleaner if we could simply call `cmd.getWritingCommand` and let // InsertIntoHadoopFsRelationCommandMeta tag the result, but calling getWritingCommand // before the table exists will crash. 
So this ends up replicating a portion of the logic diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala index 5be71b13a51..438038fb749 100644 --- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala @@ -25,6 +25,7 @@ import com.nvidia.spark.rapids.{GpuAlias, PlanShims} import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.types.DataType class PlanShimsImpl extends PlanShims { def extractExecutedPlan(plan: SparkPlan): SparkPlan = plan @@ -35,4 +36,8 @@ class PlanShimsImpl extends PlanShims { case GpuAlias(e, _) => isAnsiCast(e) case e => isAnsiCast(e) } + + def extractAnsiCastTypes(e: Expression): (DataType, DataType) = { + AnsiCastShim.extractAnsiCastTypes(e) + } } diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuCreateHiveTableAsSelectCommand.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuCreateHiveTableAsSelectCommand.scala index 5d9e8e97bed..2e7a03b8def 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuCreateHiveTableAsSelectCommand.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuCreateHiveTableAsSelectCommand.scala @@ -52,7 +52,7 @@ final class GpuCreateHiveTableAsSelectCommandMeta(cmd: CreateHiveTableAsSelectCo private var cpuWritingCommand: Option[InsertIntoHiveTable] = None - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { val spark = SparkSession.active val tableDesc = cmd.tableDesc // For the *new* table. diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala index 2e10cda8414..a4e010bbc7b 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala @@ -68,7 +68,7 @@ final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable, private var fileFormat: Option[ColumnarFileFormat] = None - override def tagSelfForGpu(): Unit = { + override def tagSelfForGpuInternal(): Unit = { // Only Hive delimited text writes are currently supported. // Check whether that is the format currently in play. 
diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
index 1ac7eeddf3b..7af5e029afb 100644
--- a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
+++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
@@ -40,6 +40,7 @@ import com.nvidia.spark.rapids.{GpuAlias, PlanShims}
 
 import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
 import org.apache.spark.sql.execution.{CommandResultExec, SparkPlan}
+import org.apache.spark.sql.types.DataType
 
 class PlanShimsImpl extends PlanShims {
   def extractExecutedPlan(plan: SparkPlan): SparkPlan = plan match {
@@ -54,4 +55,8 @@ class PlanShimsImpl extends PlanShims {
     case GpuAlias(e, _) => isAnsiCast(e)
     case e => isAnsiCast(e)
   }
+
+  def extractAnsiCastTypes(e: Expression): (DataType, DataType) = {
+    AnsiCastShim.extractAnsiCastTypes(e)
+  }
 }
diff --git a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala b/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala
index a5b0c79766e..e82c1ae9aed 100644
--- a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala
+++ b/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/AnsiCastShim.scala
@@ -26,6 +26,7 @@ package com.nvidia.spark.rapids.shims
 import com.nvidia.spark.rapids.{GpuCast, GpuEvalMode}
 
 import org.apache.spark.sql.catalyst.expressions.{Cast, EvalMode, Expression}
+import org.apache.spark.sql.types.DataType
 
 object AnsiCastShim {
   def isAnsiCast(e: Expression): Boolean = e match {
@@ -41,4 +42,9 @@ object AnsiCastShim {
       case EvalMode.TRY => GpuEvalMode.TRY
     }
   }
+
+  def extractAnsiCastTypes(e: Expression): (DataType, DataType) = e match {
+    case c: Cast => (c.child.dataType, c.dataType)
+    case _ => throw new UnsupportedOperationException(s"${e.getClass} is not Cast type")
+  }
 }
diff --git a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala b/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala
index c4f7235639b..cc4b005a1ec 100644
--- a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala
+++ b/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala
@@ -54,7 +54,7 @@ final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable,
 
   private var fileFormat: Option[ColumnarFileFormat] = None
 
-  override def tagSelfForGpu(): Unit = {
+  override def tagSelfForGpuInternal(): Unit = {
     // Only Hive delimited text writes are currently supported.
     // Check whether that is the format currently in play.
     fileFormat = GpuHiveTextFileFormat.tagGpuSupport(this)
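
The spark311 shim earlier in this diff matches on the separate AnsiCast expression class, while the 330db shim above matches on Cast, because newer Spark versions fold ANSI casting into Cast and record the behavior in an EvalMode field. A small sketch of that EvalMode-based detection, assuming a Spark version whose Cast carries evalMode (for example 3.4.0+ upstream or the Databricks 330 runtime); it is illustrative only, not the shim code:

    import org.apache.spark.sql.catalyst.expressions.{Cast, EvalMode, Expression, Literal}
    import org.apache.spark.sql.types.IntegerType

    object EvalModeSketch {
      // ANSI-ness lives on the Cast itself rather than on a dedicated AnsiCast class.
      def isAnsi(e: Expression): Boolean = e match {
        case c: Cast => c.evalMode == EvalMode.ANSI
        case _ => false
      }

      def main(args: Array[String]): Unit = {
        val ansiCast = Cast(Literal("42"), IntegerType, None, EvalMode.ANSI)
        val legacyCast = Cast(Literal("42"), IntegerType, None, EvalMode.LEGACY)
        println(isAnsi(ansiCast))   // true
        println(isAnsi(legacyCast)) // false
      }
    }
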
diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
index 2968081dff6..bf5567298bf 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
+++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PlanShimsImpl.scala
@@ -24,6 +24,7 @@ import com.nvidia.spark.rapids.{GpuAlias, PlanShims}
 import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
 import org.apache.spark.sql.execution.{CommandResultExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.ResultQueryStageExec
+import org.apache.spark.sql.types.DataType
 
 class PlanShimsImpl extends PlanShims {
   def extractExecutedPlan(plan: SparkPlan): SparkPlan = plan match {
@@ -39,4 +40,8 @@ class PlanShimsImpl extends PlanShims {
     case GpuAlias(e, _) => isAnsiCast(e)
     case e => isAnsiCast(e)
   }
+
+  def extractAnsiCastTypes(e: Expression): (DataType, DataType) = {
+    AnsiCastShim.extractAnsiCastTypes(e)
+  }
 }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
index 702928225a7..74ef6e5cc73 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala
@@ -16,11 +16,15 @@
 
 package com.nvidia.spark.rapids
 
+import java.sql.{Date, Timestamp}
+import java.time.{ZonedDateTime, ZoneId}
+import java.util.TimeZone
+
+import scala.collection.mutable.ListBuffer
+
 import ai.rapids.cudf.{ColumnVector, RegexProgram}
 import com.nvidia.spark.rapids.Arm.withResource
-import java.sql.{Date, Timestamp}
 import org.scalatest.BeforeAndAfterEach
-import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Row, SparkSession}
@@ -280,6 +284,48 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE
     assert(res)
   }
 
+  test("literals: ensure time literals are correct with different timezones") {
+    testTimeWithDiffTimezones("Asia/Shanghai", "America/New_York")
+    testTimeWithDiffTimezones("Asia/Shanghai", "UTC")
+    testTimeWithDiffTimezones("UTC", "Asia/Shanghai")
+  }
+
+  private[this] def testTimeWithDiffTimezones(sessionTZStr: String, systemTZStr: String): Unit = {
+    withTimeZones(sessionTimeZone = sessionTZStr, systemTimeZone = systemTZStr) { conf =>
+      val df = withGpuSparkSession(spark => {
+        spark.sql("SELECT current_date(), current_timestamp(), now() FROM RANGE(1, 10)")
+      }, conf)
+
+      val times = df.collect()
+      val zonedDateTime = ZonedDateTime.now(ZoneId.of("America/New_York"))
+      val res = times.forall(time => {
+        val diffDate = zonedDateTime.toLocalDate.toEpochDay - time.getLocalDate(0).toEpochDay
+        val diffTimestamp =
+          zonedDateTime.toInstant.toEpochMilli - time.getInstant(1).toEpochMilli
+        val diffNow =
+          zonedDateTime.toInstant.toEpochMilli - time.getInstant(2).toEpochMilli
+        // For date, allow at most 1 day of difference when execution crosses a day boundary
+        // For timestamp or now, the gap should stay within 1 second, allowing for Spark's execution
+        diffDate.abs <= 1 && diffTimestamp.abs <= 1E3 && diffNow.abs <= 1E3
+      })
+      assert(res)
+    }
+  }
+
+  private def withTimeZones(sessionTimeZone: String,
+      systemTimeZone: String)(f: SparkConf => Unit): Unit = {
+    val conf = new SparkConf()
+    conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, sessionTimeZone)
+    conf.set(SQLConf.DATETIME_JAVA8API_ENABLED.key, "true")
+    val originTimeZone = TimeZone.getDefault
+    try {
+      TimeZone.setDefault(TimeZone.getTimeZone(systemTimeZone))
+      f(conf)
+    } finally {
+      TimeZone.setDefault(originTimeZone)
+    }
+  }
+
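
The withTimeZones helper separates the Spark session time zone (a SQL conf) from the JVM default time zone, which is what lets the new test exercise mismatched combinations. A hypothetical additional check built on the same helper, shown only to illustrate how it is meant to be used inside ParseDateTimeSuite; the test name, query, and tolerance below are assumptions, not part of this change:

    test("literals: current_date follows the session time zone") {
      withTimeZones(sessionTimeZone = "Asia/Shanghai", systemTimeZone = "UTC") { conf =>
        val rows = withGpuSparkSession(spark => spark.sql("SELECT current_date()"), conf).collect()
        val expected = java.time.LocalDate.now(java.time.ZoneId.of("Asia/Shanghai"))
        // Allow one day of slack in case the query happens to run across midnight.
        assert(math.abs(expected.toEpochDay - rows.head.getLocalDate(0).toEpochDay) <= 1)
      }
    }
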
   private def testRegex(rule: RegexReplace, values: Seq[String], expected: Seq[String]): Unit = {
     withResource(ColumnVector.fromStrings(values: _*)) { v =>
       withResource(ColumnVector.fromStrings(expected: _*)) { expected =>