diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index dd683c04fd2..189bef329d7 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -16,6 +16,7 @@ from pyspark.sql.functions import when, col, current_date, current_timestamp from pyspark.sql.types import * from asserts import assert_gpu_and_cpu_are_equal_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from conftest import is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu from spark_session import with_cpu_session, is_databricks113_or_later @@ -195,6 +196,7 @@ def do_it(spark): @ignore_order(local=True) @allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow) @pytest.mark.parametrize('join', joins, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join): data_path = spark_tmp_path + '/PARQUET_DATA' def prep(spark): diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index cb3c4ebd151..1408894310d 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -16,6 +16,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from marks import ignore_order, incompat, approximate_float, allow_non_gpu, datagen_overrides from pyspark.sql.types import * @@ -985,6 +986,7 @@ def test_columnar_pow(data_gen): lambda spark : binary_op_df(spark, data_gen).selectExpr('pow(a, b)')) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_least(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -1001,6 +1003,7 @@ def test_least(data_gen): f.least(*command_args))) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_greatest(data_gen): num_cols = 20 s1 = with_cpu_session( diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index ec29dce70d1..29f4e64b893 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -16,7 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect from data_gen import * -from conftest import is_databricks_runtime +from conftest import is_databricks_runtime, is_not_utc from marks import incompat from spark_session import is_before_spark_313, is_before_spark_330, is_databricks113_or_later, is_spark_330_or_later, is_databricks104_or_later, is_spark_33X, is_spark_340_or_later, is_spark_330, is_spark_330cdh from pyspark.sql.types import * @@ -103,11 +103,13 @@ @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) @pytest.mark.parametrize('index_gen', array_index_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item(data_gen, index_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, index_gen).selectExpr('a[b]')) @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_lit_ordinal(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -145,6 +147,7 @@ def test_array_item_with_strict_index(strict_index_enabled, index): # No need to test this for multiple data types for array. Only one is enough, but with two kinds of invalid index. @pytest.mark.parametrize('index', [-2, 100, array_neg_index_gen, array_out_index_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_ansi_fail_invalid_index(index): message = "SparkArrayIndexOutOfBoundsException" if (is_databricks104_or_later() or is_spark_330_or_later()) else "java.lang.ArrayIndexOutOfBoundsException" if isinstance(index, int): @@ -171,6 +174,7 @@ def test_array_item_ansi_not_fail_all_null_data(): decimal_gen_32bit, decimal_gen_64bit, decimal_gen_128bit, binary_gen, StructGen([['child0', StructGen([['child01', IntegerGen()]])], ['child1', string_gen], ['child2', float_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]], nullable=False)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_array(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -183,6 +187,7 @@ def test_make_array(data_gen): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orderby_array_unique(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)), @@ -212,6 +217,7 @@ def test_orderby_array_of_structs(data_gen): @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_contains(data_gen): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True)) @@ -239,6 +245,7 @@ def test_array_contains_for_nans(data_gen): @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_element_at(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, array_no_zero_index_gen).selectExpr( @@ -303,6 +310,7 @@ def test_array_element_at_zero_index_fail(index, ansi_enabled): @pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_transform(data_gen): def do_it(spark): columns = ['a', 'b', @@ -337,6 +345,7 @@ def do_it(spark): string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + decimal_gens @pytest.mark.parametrize('data_gen', array_min_max_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, ArrayGen(data_gen)).selectExpr( @@ -361,6 +370,7 @@ def test_array_concat_decimal(data_gen): 'concat(a, a)'))) @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_column(data_gen): cnt_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[]) cnt_not_null_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[], nullable=False) @@ -374,6 +384,7 @@ def test_array_repeat_with_count_column(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_scalar(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -403,6 +414,7 @@ def test_sql_array_scalars(query): @pytest.mark.parametrize('data_gen', all_basic_gens + nested_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_array_struct_fields(data_gen): array_struct_gen = ArrayGen( StructGen([['child0', data_gen], ['child1', int_gen]]), @@ -441,6 +453,7 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', array_zips_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_zip(data_gen): gen = StructGen( [('a', data_gen), ('b', data_gen), ('c', data_gen), ('d', data_gen)], nullable=False) @@ -473,6 +486,7 @@ def q1(spark): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') @pytest.mark.skipif(is_before_spark_313() or is_spark_330() or is_spark_330cdh(), reason="NaN equality is only handled in Spark 3.1.3+ and SPARK-39976 issue with null and ArrayIntersect in Spark 3.3.0") def test_array_intersect(data_gen): gen = StructGen( @@ -514,6 +528,7 @@ def test_array_intersect_spark330(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_intersect_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -534,6 +549,7 @@ def test_array_intersect_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -554,6 +570,7 @@ def test_array_union(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -574,6 +591,7 @@ def test_array_union_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -594,6 +612,7 @@ def test_array_except(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -614,6 +633,7 @@ def test_array_except_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -635,6 +655,7 @@ def test_arrays_overlap(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -672,6 +693,7 @@ def test_array_remove_scalar(data_gen): FloatGen(special_cases=_non_neg_zero_float_special_cases + [-0.0]), DoubleGen(special_cases=_non_neg_zero_double_special_cases + [-0.0]), StringGen(pattern='[0-9]{1,5}'), boolean_gen, date_gen, timestamp_gen] + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_remove(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -686,6 +708,7 @@ def test_array_remove(data_gen): @pytest.mark.parametrize('data_gen', [ArrayGen(sub_gen) for sub_gen in array_gens_sample], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_flatten_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('flatten(a)') diff --git a/integration_tests/src/main/python/ast_test.py b/integration_tests/src/main/python/ast_test.py index 21e982e4fb8..2c06c51a876 100644 --- a/integration_tests/src/main/python/ast_test.py +++ b/integration_tests/src/main/python/ast_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture +from conftest import is_not_utc from data_gen import * from marks import approximate_float, datagen_overrides from spark_session import with_cpu_session, is_before_spark_330 @@ -70,6 +71,7 @@ def assert_binary_ast(data_descr, func, conf={}): assert_gpu_ast(is_supported, lambda spark: func(binary_op_df(spark, data_gen)), conf=conf) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -79,6 +81,7 @@ def test_literal(spark_tmp_path, data_gen): func=lambda spark: spark.read.parquet(data_path).select(scalar)) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_null_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -232,6 +235,7 @@ def test_expm1(data_descr): assert_unary_ast(data_descr, lambda df: df.selectExpr('expm1(a)')) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -241,6 +245,7 @@ def test_eq(data_descr): f.col('a') == f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -250,6 +255,7 @@ def test_ne(data_descr): f.col('a') != f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -259,6 +265,7 @@ def test_lt(data_descr): f.col('a') < f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -268,6 +275,7 @@ def test_lte(data_descr): f.col('a') <= f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -277,6 +285,7 @@ def test_gt(data_descr): f.col('a') > f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, diff --git a/integration_tests/src/main/python/cache_test.py b/integration_tests/src/main/python/cache_test.py index 662d4d9d8aa..e028e93a959 100644 --- a/integration_tests/src/main/python/cache_test.py +++ b/integration_tests/src/main/python/cache_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_equal +from conftest import is_not_utc from data_gen import * import pyspark.sql.functions as f from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330 @@ -64,6 +65,7 @@ def test_passing_gpuExpr_as_Expr(enable_vectorized_conf): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_join(data_gen, enable_vectorized_conf): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -91,6 +93,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_expand_exec(data_gen, enable_vectorized_conf): def op_df(spark, length=2048): cached = gen_df(spark, StructGen([ @@ -165,6 +168,7 @@ def n_fold(spark): @pytest.mark.parametrize('enable_vectorized', ['true', 'false'], ids=idfn) @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_columnar(spark_tmp_path, data_gen, enable_vectorized, ts_write): data_path_gpu = spark_tmp_path + '/PARQUET_DATA' def read_parquet_cached(data_path): @@ -277,6 +281,7 @@ def helper(spark): @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @pytest.mark.parametrize('batch_size', [{"spark.rapids.sql.batchSizeBytes": "100"}, {}], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_count(data_gen, with_x_session, enable_vectorized_conf, batch_size): test_conf = copy_and_update(enable_vectorized_conf, batch_size) generate_data_and_test_func_on_cached_df(with_x_session, lambda df: df.count(), data_gen, test_conf) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 53803a17403..dbb41b60bb7 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -14,10 +14,10 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_spark_exception +from asserts import * +from conftest import is_not_utc from data_gen import * -from spark_session import is_before_spark_320, is_before_spark_330, is_spark_340_or_later, \ - is_databricks113_or_later +from spark_session import * from marks import allow_non_gpu, approximate_float, datagen_overrides from pyspark.sql.types import * from spark_init_internal import spark_version @@ -152,6 +152,7 @@ def test_cast_string_date_non_ansi(): StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_string_ts_valid_format(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. @@ -299,6 +300,7 @@ def _assert_cast_to_string_equal (data_gen, conf): @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -318,6 +320,7 @@ def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -337,6 +340,7 @@ def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -401,6 +405,7 @@ def test_cast_string_to_negative_scale_decimal(): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float("inf"), float("-inf"), float("nan")]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_ansi_for_nan_inf(type, invalid_value): def fun(spark): data = [invalid_value] @@ -412,6 +417,7 @@ def fun(spark): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float(LONG_MAX) + 100, float(LONG_MIN) - 100]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_ansi_overflow(type, invalid_value): def fun(spark): data = [invalid_value] @@ -420,6 +426,7 @@ def fun(spark): assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "ArithmeticException") @pytest.mark.skipif(is_before_spark_330(), reason='330+ throws exception in ANSI mode') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_side_effect(): def getDf(spark): data = [(True, float(LONG_MAX) + 100), (False, float(1))] @@ -431,6 +438,7 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_for_nan_inf(type): def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] @@ -450,6 +458,7 @@ def fun(spark): short_gen, int_gen, long_gen_to_timestamp], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_integral_to_timestamp(gen, ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -458,6 +467,7 @@ def test_cast_integral_to_timestamp(gen, ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp(ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -467,6 +477,7 @@ def test_cast_float_to_timestamp(ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_double_to_timestamp(ansi_enabled): if (is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -484,6 +495,7 @@ def test_cast_double_to_timestamp(ansi_enabled): (INT_MIN - 1, IntegerType()), ], ids=idfn) @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): (invalid, to_type) = invalid_and_type assert_gpu_and_cpu_error( @@ -494,6 +506,7 @@ def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): error_message="overflow") @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_numeric_ansi_no_overflow(): data = [datetime.fromtimestamp(i) for i in range(BYTE_MIN, BYTE_MAX + 1)] assert_gpu_and_cpu_are_equal_collect( @@ -502,12 +515,14 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_numeric_non_ansi(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_string(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a211e881aaf..a891b667016 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import with_cpu_session, is_before_spark_330 from pyspark.sql.types import * @@ -22,6 +23,7 @@ import pyspark.sql.functions as f @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -55,6 +57,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq_ns(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -82,6 +85,7 @@ def test_eq_ns_for_interval(): f.col('a').eqNullSafe(f.col('b')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -115,6 +119,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -148,6 +153,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -182,6 +188,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -216,6 +223,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -250,6 +258,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_isnull(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( @@ -269,23 +278,27 @@ def test_isnan(data_gen): f.isnan(f.col('a')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_any(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna()) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_all(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna(how='all')) #dropna is really a filter along with a test for null, but lets do an explicit filter test too @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, BooleanGen(), data_gen, data_gen).filter(f.col('a'))) # coalesce batch happens after a filter, but only if something else happens on the GPU after that @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter_with_project(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, BooleanGen(), data_gen).filter(f.col('a')).selectExpr('*', 'a as a2')) @@ -295,6 +308,7 @@ def test_filter_with_project(data_gen): # and some constants that then make it so all we need is the number of rows # of input. @pytest.mark.parametrize('op', ['>', '<']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_empty_filter(op, spark_tmp_path): def do_it(spark): @@ -323,6 +337,7 @@ def test_filter_with_lit(expr): # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries under that value. @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_in(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) - 1 @@ -335,6 +350,7 @@ def test_in(data_gen): # This is to test entries over that value. @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index 971523248ab..43cc782df0f 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from string_test import mk_str_gen @@ -34,6 +35,7 @@ for sub_gen in all_gen + [null_gen]] @pytest.mark.parametrize('data_gen', non_nested_array_gens + nested_array_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen).selectExpr( @@ -44,6 +46,7 @@ def test_concat_list(data_gen): ) @pytest.mark.parametrize('dg', non_nested_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_double_list_with_lit(dg): data_gen = ArrayGen(dg, max_length=2) array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) @@ -67,6 +70,7 @@ def test_concat_double_list_with_lit(dg): @pytest.mark.parametrize('data_gen', non_nested_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -95,6 +99,7 @@ def test_concat_string(): f.concat(f.col('a'), f.lit('')))) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_concat(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen @@ -106,6 +111,7 @@ def test_map_concat(data_gen): ) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_concat_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -119,6 +125,7 @@ def test_map_concat_with_lit(data_gen): @pytest.mark.parametrize('data_gen', all_gen + nested_gens, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_array(data_gen, size_of_null): gen = ArrayGen(data_gen) assert_gpu_and_cpu_are_equal_collect( @@ -127,12 +134,14 @@ def test_size_of_array(data_gen, size_of_null): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_map(data_gen, size_of_null): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('size(a)'), conf={'spark.sql.legacy.sizeOfNull': size_of_null}) @pytest.mark.parametrize('data_gen', array_gens_sample + [string_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reverse(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('reverse(a)')) @@ -143,6 +152,7 @@ def test_reverse(data_gen): ] @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select( @@ -150,6 +160,7 @@ def test_sort_array(data_gen): f.sort_array(f.col('a'), False))) @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array_lit(data_gen): array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) assert_gpu_and_cpu_are_equal_collect( @@ -250,6 +261,7 @@ def test_sort_array_normalize_nans(): gens in sequence_normal_integral_gens] @pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_without_step(start_gen, stop_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, start_gen, stop_gen).selectExpr( @@ -258,6 +270,7 @@ def test_sequence_without_step(start_gen, stop_gen): "sequence(20, b)")) @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_with_step(start_gen, stop_gen, step_gen): # Get the datagen seed we use for all datagens, since we need to call start # on step_gen @@ -304,6 +317,7 @@ def test_sequence_with_step(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): assert_gpu_and_cpu_error( lambda spark:three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( @@ -318,6 +332,7 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_too_long_sequence(stop_gen): assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. @@ -359,6 +374,7 @@ def get_sequence_data(gen, len): mixed_schema) # test for 3 cases mixed in a single dataset +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_with_step_mixed_cases(): assert_gpu_and_cpu_are_equal_collect( lambda spark: get_sequence_cases_mixed_df(spark) diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index c819a64f549..48d5a05c099 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_320, is_jvm_charset_utf8 from pyspark.sql.types import * @@ -44,6 +45,7 @@ def mk_str_gen(pattern): if_nested_gens = if_array_gens_sample + if_struct_gens_sample @pytest.mark.parametrize('data_gen', all_gens + if_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -63,6 +65,7 @@ def test_if_else(data_gen): # Maps scalars are not really supported by Spark from python without jumping through a lot of hoops # so for now we are going to skip them @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else_map(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, boolean_gen, data_gen, data_gen).selectExpr( @@ -72,6 +75,7 @@ def test_if_else_map(data_gen): @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9685') @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_case_when(data_gen): num_cmps = 20 s1 = with_cpu_session( @@ -115,6 +119,7 @@ def test_nanvl(data_gen): f.nanvl(f.lit(float('nan')).cast(data_type), f.col('b')))) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -134,6 +139,7 @@ def test_nvl(data_gen): # at least one `BoundReference` @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9684') @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens_nonempty_struct + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -155,6 +161,7 @@ def test_coalesce_constant_output(): lambda spark : spark.range(1, 100).selectExpr("4 + coalesce(5, id) as nine")) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl2(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -168,6 +175,7 @@ def test_nvl2(data_gen): 'nvl2(a, {}, c)'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nullif(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -181,6 +189,7 @@ def test_nullif(data_gen): 'nullif(a, {})'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ifnull(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -232,6 +241,7 @@ def test_conditional_with_side_effects_case_when(data_gen): conf = test_conf) @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_conditional_with_side_effects_sequence(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -242,6 +252,7 @@ def test_conditional_with_side_effects_sequence(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_conditional_with_side_effects_sequence_cast(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 19ad8d29151..c10221a4407 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -14,14 +14,13 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_write, \ - assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_fallback_collect -from conftest import get_non_gpu_allowed +from asserts import * +from conftest import get_non_gpu_allowed, is_not_utc from datetime import datetime, timezone from data_gen import * from marks import * from pyspark.sql.types import * -from spark_session import with_cpu_session, is_before_spark_330, is_spark_350_or_later, is_before_spark_340, is_before_spark_341 +from spark_session import * _acq_schema = StructType([ StructField('loan_id', LongType()), @@ -249,6 +248,7 @@ def read_impl(spark): @pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_csv_read(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): updated_conf=copy_and_update(_enable_all_types_conf, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -289,6 +289,7 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, @approximate_float @pytest.mark.parametrize('data_gen', csv_supported_gens, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): gen = StructGen([('a', data_gen)], nullable=False) data_path = spark_tmp_path + '/CSV_DATA' @@ -405,6 +406,7 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -475,6 +477,7 @@ def test_input_meta_fallback(spark_tmp_path, v1_enabled_list, disable_conf): conf=updated_conf) @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/CSV_DATA' @@ -567,6 +570,7 @@ def test_csv_read_count(spark_tmp_path): @pytest.mark.parametrize("timestamp_type", [ pytest.param('TIMESTAMP_LTZ', marks=pytest.mark.xfail(is_spark_350_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/9325")), "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') @@ -619,6 +623,7 @@ def do_read(spark): @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_prefer_date_with_infer_schema(spark_tmp_path): # start date ""0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606 data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))] diff --git a/integration_tests/src/main/python/datasourcev2_read_test.py b/integration_tests/src/main/python/datasourcev2_read_test.py index c4834a53c1c..cc141700cb8 100644 --- a/integration_tests/src/main/python/datasourcev2_read_test.py +++ b/integration_tests/src/main/python/datasourcev2_read_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal +from conftest import is_not_utc from marks import * columnarClass = 'com.nvidia.spark.rapids.tests.datasourcev2.parquet.ArrowColumnarDataSourceV2' @@ -26,26 +27,31 @@ def readTable(types, classToUse): .orderBy("col1") @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_int(): assert_gpu_and_cpu_are_equal_collect(readTable("int", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_strings(): assert_gpu_and_cpu_are_equal_collect(readTable("string", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types_count(): assert_gpu_and_cpu_row_counts_equal( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_arrow_off(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 0d4f457f8e8..1d984193f9e 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import ignore_order, incompat, allow_non_gpu @@ -25,6 +26,7 @@ vals = [(-584, 1563), (1943, 1101), (2693, 2167), (2729, 0), (44, 1534), (2635, 3319), (1885, -2828), (0, 2463), (932, 2286), (0, 0)] @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timesub(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -33,6 +35,7 @@ def test_timesub(data_gen): .selectExpr("a - (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -42,6 +45,7 @@ def test_timeadd(data_gen): .selectExpr("a + (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd_daytime_column(): gen_list = [ # timestamp column max year is 1000 @@ -59,6 +63,7 @@ def test_interval_seconds_overflow_exception(): error_message="IllegalArgumentException") @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd_from_subquery(data_gen): def fun(spark): @@ -70,6 +75,7 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timesub_from_subquery(data_gen): def fun(spark): @@ -85,6 +91,7 @@ def fun(spark): # [SPARK-34896][SQL] Return day-time interval from dates subtraction # 1. Add the SQL config `spark.sql.legacy.interval.enabled` which will control when Spark SQL should use `CalendarIntervalType` instead of ANSI intervals. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateaddinterval(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -95,6 +102,7 @@ def test_dateaddinterval(data_gen): # test add days(not specify hours, minutes, seconds, milliseconds, microseconds) in ANSI mode. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateaddinterval_ansi(data_gen): days, _ = data_gen # only specify the `days` @@ -122,14 +130,17 @@ def test_datediff(data_gen): 'datediff(a, date(null))', 'datediff(a, \'2016-03-02\')')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hour(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_minute(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_second(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)')) @@ -188,6 +199,7 @@ def test_datesub(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(min_val=-106032829, max_val=103819094, special_cases=[-106032829, 103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateadd_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -201,6 +213,7 @@ def test_dateadd_with_date_overflow(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(max_val=106032829, min_val=-103819094, special_cases=[106032829, -103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_datesub_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -232,6 +245,7 @@ def test_dayofyear(data_gen): lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a')))) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a')))) @@ -248,6 +262,7 @@ def test_unsupported_fallback_unix_timestamp(data_gen): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), @@ -265,6 +280,7 @@ def test_unsupported_fallback_to_unix_timestamp(data_gen): @pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_utc_timestamp(data_gen, time_zone): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone))) @@ -280,6 +296,7 @@ def test_from_utc_timestamp_unsupported_timezone_fallback(data_gen, time_zone): @pytest.mark.parametrize('time_zone', ["UTC", "Asia/Shanghai", "EST", "MST", "VST"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_utc_timestamp_supported_timezones(data_gen, time_zone): # Remove spark.rapids.test.CPU.timezone configuration when GPU kernel is ready to really test on GPU assert_gpu_and_cpu_are_equal_collect( @@ -337,6 +354,7 @@ def fun(spark): @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) # first get expected string via `date_format` +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_to_timestamp_functions_ansi_valid(parser_policy): expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', @@ -354,6 +372,7 @@ def fun(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", "spark.sql.legacy.timeParserPolicy": "CORRECTED"} @@ -363,6 +382,7 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), @@ -370,6 +390,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( @@ -388,6 +409,7 @@ def invalid_date_string_df(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_to_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), @@ -401,6 +423,7 @@ def test_string_to_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), @@ -414,6 +437,7 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gettimestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), @@ -421,6 +445,7 @@ def test_gettimestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gettimestamp_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) @@ -435,6 +460,7 @@ def test_gettimestamp_ansi_exception(): 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM'] @pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format(data_gen, date_format): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) @@ -469,6 +495,7 @@ def test_date_format_maybe(data_gen, date_format): @pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format_maybe_incompat(data_gen, date_format): conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( @@ -480,6 +507,7 @@ def test_date_format_maybe_incompat(data_gen, date_format): # input_file_name(), otherwise filter happens before project. @allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec') @ignore_order() +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format_mmyyyy_cast_canonicalization(spark_tmp_path): data_path = spark_tmp_path + '/CSV_DATA' gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False) @@ -525,10 +553,12 @@ def test_unsupported_fallback_to_date(): seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(), DoubleGen(min_exp=0, max_exp=32), ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)] @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -536,6 +566,7 @@ def test_timestamp_seconds_long_overflow(): error_message='long overflow') @pytest.mark.parametrize('data_gen', [DecimalGen(7, 7), DecimalGen(20, 7)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_rounding_necessary(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -543,6 +574,7 @@ def test_timestamp_seconds_rounding_necessary(data_gen): error_message='Rounding necessary') @pytest.mark.parametrize('data_gen', [DecimalGen(19, 6), DecimalGen(20, 6)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_decimal_overflow(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -551,10 +583,12 @@ def test_timestamp_seconds_decimal_overflow(data_gen): millis_gens = [LongGen(min_val=-62135510400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_millis(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_millis(a)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_millis_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_millis(a)").collect(), @@ -563,6 +597,7 @@ def test_timestamp_millis_long_overflow(): micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) diff --git a/integration_tests/src/main/python/expand_exec_test.py b/integration_tests/src/main/python/expand_exec_test.py index d53000e9849..abb9a7bd094 100644 --- a/integration_tests/src/main/python/expand_exec_test.py +++ b/integration_tests/src/main/python/expand_exec_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_equal +from conftest import is_not_utc from data_gen import * import pyspark.sql.functions as f from marks import ignore_order @@ -22,6 +23,7 @@ # Many Spark versions have issues sorting large decimals, # see https://issues.apache.org/jira/browse/SPARK-40089. @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_expand_exec(data_gen): def op_df(spark, length=2048): return gen_df(spark, StructGen([ diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index b84754a3d3f..1837f31aa95 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -14,6 +14,7 @@ import pytest +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.functions import * @@ -49,6 +50,7 @@ def do_join_explain(spark): with_cpu_session(do_join_explain) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explain_set_config(): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index b51fa5a55ef..a12bd223778 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from fastparquet_utils import get_fastparquet_result_canonicalizer from spark_session import is_databricks_runtime, spark_version, with_cpu_session, with_gpu_session @@ -141,6 +142,7 @@ def read_with_fastparquet_or_plugin(spark): marks=pytest.mark.xfail(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/9778")), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): """ This test writes data_gen output to Parquet via Apache Spark, then verifies that fastparquet and the RAPIDS @@ -207,6 +209,7 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): end=pandas_min_datetime), marks=pytest.mark.xfail(reason="fastparquet reads timestamps preceding 1900 incorrectly.")), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_with_gpu(spark_tmp_path, column_gen): """ This test writes the data-gen output to file via the RAPIDS plugin, then checks that the data is read identically @@ -389,6 +392,7 @@ def write_with_fastparquet(spark, data_gen): marks=pytest.mark.xfail(reason="fastparquet fails to read nullable Struct columns written from Apache Spark. " "It fails the rewrite to parquet, thereby failing the test.")), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_rewritten_with_fastparquet(column_gen, time_format, spark_tmp_path): """ This test is a workaround to test data-types that have problems being converted diff --git a/integration_tests/src/main/python/generate_expr_test.py b/integration_tests/src/main/python/generate_expr_test.py index 46ac5c92350..cde16352236 100644 --- a/integration_tests/src/main/python/generate_expr_test.py +++ b/integration_tests/src/main/python/generate_expr_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from pyspark.sql.types import * @@ -37,6 +38,7 @@ def four_op_df(spark, gen, length=2048): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('a', 'explode(array(b, c, d))')) @@ -45,6 +47,7 @@ def test_explode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -60,6 +63,7 @@ def test_explode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + map_gens_sample + arrays_with_binary + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -70,6 +74,7 @@ def test_explode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -80,6 +85,7 @@ def test_explode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -94,6 +100,7 @@ def test_explode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -104,6 +111,7 @@ def test_explode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -114,6 +122,7 @@ def test_explode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -125,6 +134,7 @@ def test_explode_outer_nested_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a')) @@ -133,6 +143,7 @@ def test_posexplode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -147,6 +158,7 @@ def test_posexplode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -157,6 +169,7 @@ def test_posexplode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -167,6 +180,7 @@ def test_posexplode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -181,6 +195,7 @@ def test_posexplode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -191,6 +206,7 @@ def test_posexplode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -201,6 +217,7 @@ def test_posexplode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_nested_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -225,6 +242,7 @@ def test_stack(): # gpu stack not guarantee to produce the same output order as Spark does @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_stack_mixed_types(): base_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen, null_gen, DecimalGen(precision=7, scale=3), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 4ecf42a9b42..0c99fc4516a 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -15,11 +15,9 @@ import math import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal,\ - assert_gpu_and_cpu_are_equal_sql,\ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_sql_with_capture,\ - assert_cpu_and_gpu_are_equal_collect_with_capture, run_with_cpu, run_with_cpu_and_gpu +from asserts import * from conftest import is_databricks_runtime +from conftest import is_not_utc from data_gen import * from functools import reduce from pyspark.sql.types import * @@ -336,6 +334,7 @@ def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, overri @allow_non_gpu("SortAggregateExec", "SortExec", "ShuffleExchangeExec") @ignore_order @pytest.mark.parametrize('data_gen', _grpkey_nested_structs_with_array_basic_child + _grpkey_list_with_non_nested_children, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_grpby_list_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).coalesce(1).groupby('a').agg(f.min('b'), f.max('b')) @@ -618,6 +617,7 @@ def test_decimal128_min_max_group_by(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_min_max_group_by(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) @@ -633,6 +633,7 @@ def test_min_max_group_by(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_list_op, ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg): def doit(spark): df = gen_df(spark, data_gen, length=100)\ @@ -664,6 +665,7 @@ def doit(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -672,6 +674,7 @@ def test_hash_groupby_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -686,6 +689,7 @@ def test_hash_groupby_collect_set_on_nested_type(data_gen): @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -707,6 +711,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -714,6 +719,7 @@ def test_hash_reduction_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -727,6 +733,7 @@ def test_hash_reduction_collect_set_on_nested_type(data_gen): @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -746,6 +753,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_with_single_distinct(data_gen): # test collect_ops with other distinct aggregations assert_gpu_and_cpu_are_equal_collect( @@ -758,6 +766,7 @@ def test_hash_groupby_collect_with_single_distinct(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_single_distinct_collect(data_gen): # test distinct collect sql = """select a, @@ -781,6 +790,7 @@ def test_hash_groupby_single_distinct_collect(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_with_multi_distinct(data_gen): def spark_fn(spark_session): return gen_df(spark_session, data_gen, length=100).groupby('a').agg( @@ -807,6 +817,7 @@ def spark_fn(spark_session): @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_fallback(data_gen, replace_mode, aqe_enabled, @@ -854,6 +865,7 @@ def test_hash_groupby_collect_partial_replace_fallback(data_gen, @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_with_distinct_fallback(data_gen, replace_mode, aqe_enabled, @@ -1252,6 +1264,7 @@ def test_first_last_reductions_decimal_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_first_last_reductions_nested_types(data_gen): assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic @@ -1260,6 +1273,7 @@ def test_first_last_reductions_nested_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_generic_reductions(data_gen): local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) assert_gpu_and_cpu_are_equal_collect( @@ -1277,6 +1291,7 @@ def test_generic_reductions(data_gen): conf=local_conf) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_count(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen) \ @@ -1288,6 +1303,7 @@ def test_count(data_gen): conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( @@ -1311,6 +1327,7 @@ def test_arithmetic_reductions(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_collect_list_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( # coalescing because collect_list is not deterministic @@ -1329,6 +1346,7 @@ def test_collect_list_reductions(data_gen): @pytest.mark.parametrize('data_gen', _no_neg_zero_all_basic_gens + decimal_gens + _struct_only_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_collect_set_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('sort_array(collect_set(a))'), @@ -1342,6 +1360,7 @@ def test_collect_empty(): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] agg_fn = lambda df: df.groupBy('a').agg( @@ -1355,6 +1374,7 @@ def test_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _struct_only_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sorted_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] # sort by more than the group by columns to be sure that first/last don't remove the ordering @@ -1372,6 +1392,7 @@ def test_sorted_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_agg_count(data_gen, count_func): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], @@ -2028,6 +2049,7 @@ def test_std_variance_partial_replace_fallback(data_gen, null_gen] + array_gens_sample + struct_gens_sample @ignore_order(local=True) @pytest.mark.parametrize('data_gen', gens_for_max_min, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_min_max_in_groupby_and_reduction(data_gen): df_gen = [('a', data_gen), ('b', RepeatSeqGen(IntegerGen(), length=20))] diff --git a/integration_tests/src/main/python/hashing_test.py b/integration_tests/src/main/python/hashing_test.py index 6bd56da933d..e2a753ecaeb 100644 --- a/integration_tests/src/main/python/hashing_test.py +++ b/integration_tests/src/main/python/hashing_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from spark_session import is_before_spark_320 @@ -46,11 +47,13 @@ @ignore_order(local=True) @pytest.mark.parametrize("gen", _xxhash_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_single_column(gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, gen).selectExpr("a", "xxhash64(a)")) @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_multi_column(): gen = StructGen(_struct_of_xxhash_gens.children, nullable=False) col_list = ",".join(gen.data_type.fieldNames()) diff --git a/integration_tests/src/main/python/hive_delimited_text_test.py b/integration_tests/src/main/python/hive_delimited_text_test.py index e316f0df934..4d07a077ec0 100644 --- a/integration_tests/src/main/python/hive_delimited_text_test.py +++ b/integration_tests/src/main/python/hive_delimited_text_test.py @@ -13,7 +13,7 @@ # limitations under the License. from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_fallback_collect -from conftest import get_non_gpu_allowed +from conftest import get_non_gpu_allowed, is_not_utc from data_gen import * from enum import Enum from marks import * @@ -187,6 +187,7 @@ def read_impl(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_hive_text_read(std_input_path, name, schema, spark_tmp_table_factory, options): assert_gpu_and_cpu_are_equal_collect(read_hive_text_sql(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), @@ -239,6 +240,7 @@ def read_hive_text_table(spark, text_table_name, fields="my_field"): "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -282,6 +284,7 @@ def read_hive_text_table_partitions(spark, text_table_name, partition): @approximate_float @allow_non_gpu("EqualTo,IsNotNull,Literal") # Accounts for partition predicate: `WHERE dt='1'` @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -300,6 +303,7 @@ def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_ta @approximate_float @allow_non_gpu("EqualTo,IsNotNull,Literal,Or") # Accounts for partition predicate @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_two_partitions(spark_tmp_path, data_gen, spark_tmp_table_factory): """ Added to reproduce: https://github.com/NVIDIA/spark-rapids/issues/7383 @@ -525,6 +529,7 @@ def create_table_with_compressed_files(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_hive_text_write(std_input_path, input_dir, schema, spark_tmp_table_factory, mode, options): # Configure table options, including schema. if options is None: diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index d7de6f1084e..7bc5ceede85 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -14,14 +14,12 @@ import pytest -from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_fallback_collect, \ - assert_gpu_and_cpu_are_equal_collect, assert_equal, run_with_cpu_and_gpu -from conftest import spark_jvm +from asserts import * +from conftest import spark_jvm, is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import * -from spark_session import is_hive_available, is_spark_33X, is_spark_340_or_later, with_cpu_session, \ - is_databricks122_or_later +from spark_session import * # Using timestamps from 1970 to work around a cudf ORC bug # https://github.com/NVIDIA/spark-rapids/issues/140. @@ -61,6 +59,7 @@ def _restricted_timestamp(nullable=True): @pytest.mark.skipif(not is_hive_available(), reason="Hive is missing") @pytest.mark.parametrize("gens", _write_gens, ids=idfn) @pytest.mark.parametrize("storage", ["PARQUET", "nativeorc", "hiveorc"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_optimized_hive_ctas_basic(gens, storage, spark_tmp_table_factory): data_table = spark_tmp_table_factory.get() gen_list = [('c' + str(i), gen) for i, gen in enumerate(gens)] diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index ee9a7fe1d8b..ba172715936 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -17,7 +17,7 @@ from pyspark.sql.functions import array_contains, broadcast, col from pyspark.sql.types import * from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture -from conftest import is_databricks_runtime, is_emr_runtime +from conftest import is_databricks_runtime, is_emr_runtime, is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime @@ -170,6 +170,7 @@ def do_join(spark): (all_gen, '1g'), (join_small_batch_gens, '1000')), ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join(data_gen, join_type, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -180,6 +181,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_ridealong(data_gen, join_type): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) @@ -193,6 +195,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_wrong_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -212,6 +215,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) @pytest.mark.parametrize('sub_part_enabled', ['false', 'true'], ids=['SubPartition_OFF', 'SubPartition_ON']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_join_ridealong(data_gen, join_type, sub_part_enabled): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 50, 500) @@ -228,6 +232,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_table(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -239,6 +244,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_table_ridealong(data_gen, join_type): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) @@ -252,6 +258,7 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_table_with_job_group(data_gen, join_type): with_cpu_session(lambda spark : spark.sparkContext.setJobGroup("testjob1", "test", False)) def do_join(spark): @@ -266,6 +273,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)], '100')), ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cartesian_join(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -305,6 +313,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen, '1g'), (join_small_batch_gens, '100')), ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cartesian_join_with_condition(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -322,6 +331,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens, '100')), ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_join(data_gen, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -357,6 +367,7 @@ def do_join(spark): (join_ast_gen, '1g'), ([int_gen], 100)), ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -371,6 +382,7 @@ def do_join(spark): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -429,6 +441,7 @@ def do_join(spark): float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen)) @@ -441,6 +454,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -456,6 +470,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -470,6 +485,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -478,6 +494,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -488,6 +505,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['LeftOuter', 'LeftSemi', 'LeftAnti', 'FullOuter'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -498,6 +516,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['RightOuter', 'FullOuter'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) @@ -514,6 +533,7 @@ def do_join(spark): # Specify 200 shuffle partitions to test cases where streaming side is empty # as in https://github.com/NVIDIA/spark-rapids/issues/7516 @pytest.mark.parametrize('shuffle_conf', [{}, {'spark.sql.shuffle.partitions': 200}], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_left_table(data_gen, join_type, shuffle_conf): def do_join(spark): left, right = create_df(spark, data_gen, 250, 500) @@ -525,6 +545,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_with_conditionals(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -579,6 +600,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_with_condition_ast(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -695,6 +717,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_as_key(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -706,6 +729,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_mixed_key(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -718,6 +742,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -732,6 +757,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_struct_as_key(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -743,6 +769,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_struct_mixed_key(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -767,6 +794,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['FullOuter'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index bb99a01425f..41571a203d5 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -15,14 +15,13 @@ import pyspark.sql.functions as f import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, \ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from asserts import * from data_gen import * +from conftest import is_not_utc from datetime import timezone from conftest import is_databricks_runtime from marks import approximate_float, allow_non_gpu, ignore_order -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_320, is_before_spark_330, is_before_spark_340, \ - is_before_spark_341 +from spark_session import * json_supported_gens = [ # Spark does not escape '\r' or '\n' even though it uses it to mark end of record @@ -185,6 +184,7 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -208,6 +208,7 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') @@ -216,6 +217,7 @@ def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') @@ -395,6 +397,7 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an 'CORRECTED', 'EXCEPTION' ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, \ spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, @@ -452,6 +455,7 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list): lambda spark : spark.read.schema(schema).json(data_path), conf=updated_conf) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_map(): # The test here is working around some inconsistencies in how the keys are parsed for maps # on the GPU the keys are dense, but on the CPU they are sparse @@ -486,6 +490,7 @@ def test_from_json_map_fallback(): 'struct', 'struct', ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct(schema): # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/9588 json_string_gen = StringGen(r'{"a": [1-9]{0,5}, "b": "[A-Z]{0,5}", "c": 1\d\d\d}') \ @@ -505,6 +510,7 @@ def test_from_json_struct(schema): r'{ "bool": [0-9]{4}-[0-9]{2}-[0-9]{2} }', r'{ "bool": "[0-9]{4}-[0-9]{2}-[0-9]{2}" }' ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_boolean(pattern): json_string_gen = StringGen(pattern) \ .with_special_case('', weight=50) \ @@ -514,6 +520,7 @@ def test_from_json_struct_boolean(pattern): .select(f.col('a'), f.from_json('a', 'struct')), conf={"spark.rapids.sql.expression.JsonToStructs": True}) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_decimal(): json_string_gen = StringGen(r'{ "a": "[+-]?([0-9]{0,5})?(\.[0-9]{0,2})?([eE][+-]?[0-9]{1,2})?" }') \ .with_special_pattern('', weight=50) \ @@ -553,6 +560,7 @@ def test_from_json_struct_decimal(): pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')), "CORRECTED" ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_date(date_gen, date_format, time_parser_policy): json_string_gen = StringGen(r'{ "a": ' + date_gen + ' }') \ .with_special_case('{ "a": null }') \ @@ -635,6 +643,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format "CORRECTED" ]) @pytest.mark.parametrize('ansi_enabled', [ True, False ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled): json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \ .with_special_case('{ "a": null }') \ @@ -686,6 +695,7 @@ def test_from_json_struct_timestamp_fallback_non_default_format(timestamp_gen, t @pytest.mark.parametrize('schema', ['struct', 'struct>', 'struct>']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_of_struct(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": {"name": "[A-Z]{1}[a-z]{2,5}", "age": 1\d}}') \ @@ -700,6 +710,7 @@ def test_from_json_struct_of_struct(schema): @pytest.mark.parametrize('schema', ['struct', 'struct>>', 'struct>>']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_of_list(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": \[{"name": "[A-Z]{1}[a-z]{2,5}", "class": "junior"},' \ @@ -712,6 +723,7 @@ def test_from_json_struct_of_list(schema): conf={"spark.rapids.sql.expression.JsonToStructs": True}) @pytest.mark.parametrize('schema', ['struct', 'struct']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_all_empty_string_input(schema): json_string_gen = StringGen('') assert_gpu_and_cpu_are_equal_collect( @@ -788,6 +800,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')), False ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty): struct_gen = StructGen([ ('a', data_gen), @@ -811,4 +824,4 @@ def struct_to_json(spark): assert_gpu_and_cpu_are_equal_collect( lambda spark : struct_to_json(spark), - conf=conf) \ No newline at end of file + conf=conf) diff --git a/integration_tests/src/main/python/limit_test.py b/integration_tests/src/main/python/limit_test.py index 5e116b00654..efe81c1058a 100644 --- a/integration_tests/src/main/python/limit_test.py +++ b/integration_tests/src/main/python/limit_test.py @@ -15,11 +15,13 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_340 from marks import allow_non_gpu, approximate_float @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample + struct_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( # We need some processing after the limit to avoid a CollectLimitExec @@ -80,6 +82,7 @@ def test_non_zero_offset_with_limit(limit, offset, batch_size): @pytest.mark.skipif(is_before_spark_340(), reason='offset is introduced from Spark 3.4.0') @allow_non_gpu('ShuffleExchangeExec') # when limit = 0, ShuffleExchangeExec is not replaced. @approximate_float +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_order_by_offset_with_limit(limit, offset, data_gen, batch_size): # In CPU version of spark, (limit, offset) can not be negative number. # Test case description: diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py index 8504b38d00d..5daeb916e22 100644 --- a/integration_tests/src/main/python/map_test.py +++ b/integration_tests/src/main/python/map_test.py @@ -14,12 +14,12 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, \ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from asserts import * +from conftest import is_not_utc from data_gen import * from conftest import is_databricks_runtime from marks import allow_non_gpu, ignore_order, datagen_overrides -from spark_session import is_before_spark_330, is_databricks104_or_later, is_databricks113_or_later, is_spark_33X, is_spark_340_or_later +from spark_session import * from pyspark.sql.functions import create_map, col, lit, row_number from pyspark.sql.types import * from pyspark.sql.types import IntegralType @@ -57,6 +57,7 @@ @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -68,6 +69,7 @@ def test_map_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_values(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -79,6 +81,7 @@ def test_map_values(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_entries(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -114,6 +117,7 @@ def decimal_value_gen(): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_string_keys(data_gen): index_gen = StringGen() assert_gpu_and_cpu_are_equal_collect( @@ -137,6 +141,7 @@ def test_get_map_value_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_numeric_keys(data_gen): key_gen = data_gen._key_gen assert_gpu_and_cpu_are_equal_collect( @@ -150,6 +155,7 @@ def test_get_map_value_numeric_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_supported_keys(data_gen): key_gen = data_gen._key_gen # first expression is not guaranteed to hit @@ -188,6 +194,7 @@ def query_map_scalar(spark): @allow_non_gpu('WindowLocalExec') @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9683') @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_scalars_supported_key_types(data_gen): key_gen = data_gen._key_gen def query_map_scalar(spark): @@ -225,6 +232,7 @@ def query_map_scalar(spark): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -236,6 +244,7 @@ def test_get_map_value_date_keys(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_timestamp_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -254,6 +263,7 @@ def test_map_side_effects(): @pytest.mark.parametrize('key_gen', [StringGen(nullable=False), IntegerGen(nullable=False), basic_struct_gen], ids=idfn) @pytest.mark.parametrize('value_gen', [StringGen(nullable=True), IntegerGen(nullable=True), basic_struct_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_entry_map(key_gen, value_gen): data_gen = [('a', key_gen), ('b', value_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -457,6 +467,7 @@ def test_simple_get_map_value_with_strict_index(strict_index, data_gen): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_string_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -470,6 +481,7 @@ def test_element_at_map_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_numeric_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -485,6 +497,7 @@ def test_element_at_map_numeric_keys(data_gen): [MapGen(DecimalGen(precision=35, scale=2, nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_element_at_map_dec_col_keys(data_gen): keys = DecimalGen(precision=35, scale=2) assert_gpu_and_cpu_are_equal_collect( @@ -510,6 +523,7 @@ def test_get_map_value_element_at_map_string_col_keys_ansi(data_gen, ansi): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_element_at_map_string_col_keys(data_gen): keys = StringGen(pattern='key_[0-9]') assert_gpu_and_cpu_are_equal_collect( @@ -566,6 +580,7 @@ def test_get_map_value_string_col_keys_ansi_null(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -579,6 +594,7 @@ def test_element_at_map_date_keys(data_gen): [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_timestamp_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -610,6 +626,7 @@ def test_map_element_at_ansi_null(data_gen): conf=ansi_enabled_conf) @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_transform_values(data_gen): def do_it(spark): columns = ['a', 'b', @@ -648,6 +665,7 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_128_map_gens + decimal_64_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_transform_keys(data_gen): # The processing here is very limited, because we need to be sure we do not create duplicate keys. # This can happen because of integer overflow, round off errors in floating point, etc. So for now @@ -707,6 +725,7 @@ def test_sql_map_scalars(query): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_filter(data_gen): columns = ['map_filter(a, (key, value) -> isnotnull(value) )', 'map_filter(a, (key, value) -> isnull(value) )', diff --git a/integration_tests/src/main/python/mortgage_test.py b/integration_tests/src/main/python/mortgage_test.py index aed9aa63c85..00bab066651 100644 --- a/integration_tests/src/main/python/mortgage_test.py +++ b/integration_tests/src/main/python/mortgage_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_iterator +from conftest import is_not_utc from marks import approximate_float, incompat, ignore_order, allow_non_gpu, limit @incompat @@ -22,6 +23,7 @@ @limit @ignore_order @allow_non_gpu(any=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_mortgage(mortgage): assert_gpu_and_cpu_are_equal_iterator( lambda spark : mortgage.do_test_query(spark)) diff --git a/integration_tests/src/main/python/orc_cast_test.py b/integration_tests/src/main/python/orc_cast_test.py index 45860d5b299..cccd60125b9 100644 --- a/integration_tests/src/main/python/orc_cast_test.py +++ b/integration_tests/src/main/python/orc_cast_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from spark_session import with_cpu_session @@ -49,6 +50,7 @@ def test_casting_among_integer_types(spark_tmp_path, reader_confs, v1_enabled_li @pytest.mark.parametrize('to_type', ['float', 'double', 'string', 'timestamp']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_integer(spark_tmp_path, to_type): orc_path = spark_tmp_path + '/orc_cast_integer' # The Python 'datetime' module only supports a max-year of 10000, so we set the Long type max @@ -70,6 +72,7 @@ def test_casting_from_integer(spark_tmp_path, to_type): @pytest.mark.parametrize('overflow_long_gen', [LongGen(min_val=int(1e16)), LongGen(max_val=int(-1e16))]) @pytest.mark.parametrize('to_type', ['timestamp']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_overflow_long(spark_tmp_path, overflow_long_gen,to_type): # Timestamp(micro-seconds) is actually type of int64, when casting long(int64) to timestamp, # we need to multiply 1e6 (or 1e3), and it may cause overflow. This function aims to test @@ -100,6 +103,7 @@ def test_casting_from_float_and_double(spark_tmp_path, to_type): @pytest.mark.parametrize('data_gen', [DoubleGen(max_exp=32, special_cases=None), DoubleGen(max_exp=32, special_cases=[8.88e9, 9.99e10, 1.314e11])]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): # ORC will assume the original double value in seconds, we need to convert them to # timestamp(INT64 in micro-seconds). @@ -123,6 +127,7 @@ def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): ) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_overflow_double_to_timestamp(spark_tmp_path): orc_path = spark_tmp_path + '/orc_casting_from_overflow_double_to_timestamp' with_cpu_session( diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index cbb2ee9e703..409d0850987 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -14,13 +14,13 @@ import pytest -from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \ - assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql +from asserts import * +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * from spark_init_internal import spark_version -from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330, is_spark_cdh, is_spark_340_or_later +from spark_session import * from parquet_test import _nested_pruning_schemas from conftest import is_databricks_runtime @@ -68,6 +68,7 @@ def get_orc_timestamp_gen(nullable=True): @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, reader_confs): all_confs = copy_and_update(reader_confs, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -159,6 +160,7 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf): @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -184,6 +186,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/ORC_DATA' # Append two struct columns to verify nested predicate pushdown. @@ -240,6 +243,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators @@ -306,6 +310,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators @@ -584,6 +589,7 @@ def test_read_struct_without_stream(spark_tmp_path): @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('case_sensitive', ["false", "true"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive): struct_gen = StructGen([('nested_col', orc_gen)]) # Map is not supported yet. @@ -771,6 +777,7 @@ def test_orc_read_varchar_as_string(std_input_path): @pytest.mark.parametrize('gens', orc_gens_list, ids=idfn) @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_order): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -785,6 +792,7 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_ @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order): orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py index cee10b9ce4e..5617f8e20e5 100644 --- a/integration_tests/src/main/python/orc_write_test.py +++ b/integration_tests/src/main/python/orc_write_test.py @@ -16,6 +16,7 @@ from asserts import assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write from spark_session import is_before_spark_320, is_spark_321cdh, is_spark_cdh, with_cpu_session, with_gpu_session +from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * from marks import * @@ -80,6 +81,7 @@ @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -114,6 +116,7 @@ def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl): # There are race conditions around when individual files are read in for partitioned data @ignore_order @pytest.mark.parametrize('orc_gen', orc_part_write_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_part_write_round_trip(spark_tmp_path, orc_gen): gen_list = [('a', RepeatSeqGen(orc_gen, 10)), ('b', orc_gen)] @@ -167,6 +170,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_save_table(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -189,6 +193,7 @@ def write_orc_sql_from(spark, df, data_path, write_to_table): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('ts_type', ["TIMESTAMP_MICROS", "TIMESTAMP_MILLIS"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -200,6 +205,7 @@ def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') @pytest.mark.parametrize('codec', ['zlib', 'lzo']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orc_write_compression_fallback(spark_tmp_path, codec, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -256,6 +262,7 @@ def sql_write(spark, path): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index 8efacc18d3e..f6cc2a0141b 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -15,8 +15,8 @@ import pytest -from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture, assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, \ - assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_spark_exception +from asserts import * +from conftest import is_not_utc from data_gen import * from parquet_write_test import parquet_nested_datetime_gen, parquet_ts_write_options from marks import * @@ -25,7 +25,7 @@ from pyspark.sql.types import * from pyspark.sql.functions import * from spark_init_internal import spark_version -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_320, is_before_spark_330, is_spark_321cdh +from spark_session import * from conftest import is_databricks_runtime, is_dataproc_runtime @@ -164,6 +164,7 @@ def setup_table(spark): @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_round_trip(spark_tmp_path, parquet_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -298,6 +299,7 @@ def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_l @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('a', RepeatSeqGen(parquet_gen, 100)), ('b', parquet_gen)] @@ -317,6 +319,7 @@ def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1 @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, parquet_gens, ts_type, ts_rebase_write, ts_rebase_read, reader_confs, v1_enabled_list): @@ -356,6 +359,7 @@ def test_parquet_decimal_read_legacy(spark_tmp_path, parquet_gens, read_func, re @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize('batch_size', [100, INT_MAX]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs, batch_size): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -387,6 +391,7 @@ def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader # In this we are reading the data, but only reading the key the data was partitioned by @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reader_confs): parquet_gens = [byte_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -529,6 +534,7 @@ def read_and_remove(spark): @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -553,6 +559,7 @@ def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -868,6 +875,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters(spark_tmp_path, read @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' @@ -895,6 +903,7 @@ def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_c @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_reading_from_unaligned_pages_all_types_dict_optimized(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index 642d99c8f0b..a4600de7b86 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -16,7 +16,7 @@ # https://github.com/apache/parquet-testing from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error -from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run +from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run, is_not_utc from data_gen import copy_and_update from pathlib import Path import pytest @@ -122,6 +122,7 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index c83939f4774..9584f2a3520 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -14,14 +14,14 @@ import pytest -from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write, assert_spark_exception +from asserts import * +from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * from enum import Enum from marks import * from pyspark.sql.types import * -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330, is_before_spark_320, is_spark_cdh, \ - is_databricks_runtime, is_before_spark_340, is_spark_340_or_later, is_databricks122_or_later +from spark_session import * import pyspark.sql.functions as f import pyspark.sql.utils @@ -90,6 +90,7 @@ @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_round_trip(spark_tmp_path, parquet_gens): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -135,6 +136,7 @@ def test_write_round_trip_corner(spark_tmp_path, par_gen): ArrayGen(TimestampGen(), max_length=10), MapGen(TimestampGen(nullable=False), TimestampGen())]], ids=idfn) @pytest.mark.parametrize('ts_type', parquet_ts_write_options) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -148,6 +150,7 @@ def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): @pytest.mark.parametrize('ts_type', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase', ['CORRECTED']) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -171,6 +174,7 @@ def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): @ignore_order @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gen', parquet_part_write_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_part_write_round_trip(spark_tmp_path, parquet_gen): gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)] @@ -185,6 +189,7 @@ def test_part_write_round_trip(spark_tmp_path, parquet_gen): @pytest.mark.skipif(is_spark_340_or_later() or is_databricks122_or_later(), reason="`WriteFilesExec` is only supported in Spark 340+") @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) @pytest.mark.allow_non_gpu("DataWritingCommandExec") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_int96_write_conf(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -202,6 +207,7 @@ def test_int96_write_conf(spark_tmp_path, data_gen): @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) # Note: From Spark 340, WriteFilesExec is introduced. @pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -215,6 +221,7 @@ def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): ['DataWritingCommandExec', 'WriteFilesExec'], confs) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_all_null_int96(spark_tmp_path): class AllNullTimestampGen(TimestampGen): def start(self, rand): @@ -244,6 +251,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -261,6 +269,7 @@ def write_parquet_sql_from(spark, df, data_path, write_to_table): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_sql_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -283,6 +292,7 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))), ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))]) @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase): ts_write, gen = ts_write_data_gen data_path = spark_tmp_path + '/PARQUET_DATA' @@ -461,6 +471,7 @@ def generate_map_with_empty_validity(spark, path): @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', ['EXCEPTION']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_write_fails_legacy_datetime(spark_tmp_path, data_gen, ts_write, ts_rebase_write): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, @@ -478,6 +489,7 @@ def writeParquetCatchException(spark, data_gen, data_path): @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, data_gen, ts_write, ts_rebase_write, ts_rebase_read): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -521,6 +533,7 @@ def test_it(spark): with_gpu_session(test_it, conf) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -760,6 +773,7 @@ def read_table(spark, path): # Test to avoid regression on a known bug in Spark. For details please visit https://github.com/NVIDIA/spark-rapids/issues/8693 @pytest.mark.parametrize('ts_rebase', ['LEGACY', 'CORRECTED']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase): def func_test(create_table, read_table, data_path, conf): assert_gpu_and_cpu_writes_are_equal_collect(create_table, read_table, data_path, conf=conf) diff --git a/integration_tests/src/main/python/qa_nightly_select_test.py b/integration_tests/src/main/python/qa_nightly_select_test.py index ba3414e51fe..1349de3fcdf 100644 --- a/integration_tests/src/main/python/qa_nightly_select_test.py +++ b/integration_tests/src/main/python/qa_nightly_select_test.py @@ -16,6 +16,7 @@ from pyspark.sql.types import * from pyspark import SparkConf, SparkContext, SQLContext import pyspark.sql.functions as f +from conftest import is_not_utc import datetime from argparse import ArgumentParser from decimal import Decimal @@ -158,6 +159,7 @@ def idfn(val): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -170,6 +172,7 @@ def test_select(sql_query_line, pytestconfig): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_needs_sort_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -182,6 +185,7 @@ def test_needs_sort_select(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_JOIN_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_join(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -198,6 +202,7 @@ def init_tables(spark): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_PRE_ORDER_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_first_last(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -210,6 +215,7 @@ def test_select_first_last(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_FLOAT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_float_order_local(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -224,6 +230,7 @@ def test_select_float_order_local(sql_query_line, pytestconfig): @qarun @pytest.mark.parametrize('sql_query_line', SELECT_REGEXP_SQL, ids=idfn) @pytest.mark.skipif(not is_jvm_charset_utf8(), reason="Regular expressions require UTF-8") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_regexp(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: diff --git a/integration_tests/src/main/python/repart_test.py b/integration_tests/src/main/python/repart_test.py index d44280ada69..60e0a191f25 100644 --- a/integration_tests/src/main/python/repart_test.py +++ b/integration_tests/src/main/python/repart_test.py @@ -16,6 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect from spark_session import is_before_spark_320, is_before_spark_330 +from conftest import is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu import pyspark.sql.functions as f @@ -89,6 +90,7 @@ def test_union_struct_missing_children(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen))) @@ -99,6 +101,7 @@ def test_union(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unionAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen))) @@ -113,6 +116,7 @@ def test_unionAll(data_gen): struct_of_maps], ids=idfn) # This tests the union of two DFs of structs with missing child column names. The missing child # column will be replaced by nulls in the output DF. This is a feature added in 3.1+ +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_missing_col_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).withColumnRenamed("a", "x") @@ -154,6 +158,7 @@ def assert_union_equal(gen1, gen2): StructGen([['child0', DecimalGen(7, 2)]]), nested_struct, struct_of_maps], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen))) @@ -165,12 +170,14 @@ def test_union_by_name(data_gen): pytest.param([('array' + str(i), gen) for i, gen in enumerate(array_gens_sample + [ArrayGen(BinaryGen(max_length=5), max_length=5)])]), pytest.param([('map' + str(i), gen) for i, gen in enumerate(map_gens_sample)]), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce_types(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen).coalesce(2)) @pytest.mark.parametrize('num_parts', [1, 10, 100, 1000, 2000], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce_df(num_parts, length): #This should change eventually to be more than just the basic gens gen_list = [('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])] @@ -186,6 +193,7 @@ def test_coalesce_df(num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_repartition_df(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -202,6 +210,7 @@ def test_repartition_df(data_gen, num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_repartition_df_for_round_robin(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -275,6 +284,7 @@ def test_hash_fallback(data_gen): ([('a', decimal_gen_64bit), ('b', decimal_gen_64bit), ('c', decimal_gen_64bit)], ['a', 'b', 'c']), ([('a', decimal_gen_128bit), ('b', decimal_gen_128bit), ('c', decimal_gen_128bit)], ['a', 'b', 'c']), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_repartition_exact(gen, num_parts): data_gen = gen[0] part_on = gen[1] diff --git a/integration_tests/src/main/python/row-based_udf_test.py b/integration_tests/src/main/python/row-based_udf_test.py index e849a87b10e..19b02f2e24e 100644 --- a/integration_tests/src/main/python/row-based_udf_test.py +++ b/integration_tests/src/main/python/row-based_udf_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from spark_session import with_spark_session, is_spark_350_or_later from conftest import skip_unless_precommit_tests @@ -33,6 +34,7 @@ def load_hive_udf(spark, udfname, udfclass): @pytest.mark.xfail(condition=is_spark_350_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/9064') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_simple_udf(): with_spark_session(skip_if_no_hive) @@ -46,6 +48,7 @@ def evalfn(spark): "SELECT i, emptysimple(s, 'const_string') FROM hive_simple_udf_test_table", conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'}) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_generic_udf(): with_spark_session(skip_if_no_hive) def evalfn(spark): diff --git a/integration_tests/src/main/python/row_conversion_test.py b/integration_tests/src/main/python/row_conversion_test.py index 92ea05d68be..bc13419c8ec 100644 --- a/integration_tests/src/main/python/row_conversion_test.py +++ b/integration_tests/src/main/python/row_conversion_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, approximate_float, incompat from pyspark.sql.types import * @@ -28,6 +29,7 @@ # to be brought back to the CPU (rows) to be returned. # So we just need a very simple operation in the middle that # can be done on the GPU. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["g", string_gen], ["h", boolean_gen], @@ -42,6 +44,7 @@ def test_row_conversions(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["h", boolean_gen], @@ -50,6 +53,7 @@ def test_row_conversions_fixed_width(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width_wide(): gens = [["a{}".format(i), ByteGen(nullable=True)] for i in range(10)] + \ [["b{}".format(i), ShortGen(nullable=True)] for i in range(10)] + \ diff --git a/integration_tests/src/main/python/sample_test.py b/integration_tests/src/main/python/sample_test.py index fc9d9fc4cbf..5ae72212bed 100644 --- a/integration_tests/src/main/python/sample_test.py +++ b/integration_tests/src/main/python/sample_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from spark_session import is_before_spark_330 @@ -38,6 +39,7 @@ def test_sample_produce_empty_batch(data_gen): # the following cases is the normal cases and do not use @ignore_order nested_gens = array_gens_sample + struct_gens_sample + map_gens_sample @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10) @@ -45,6 +47,7 @@ def test_sample(data_gen): ) @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample_with_replacement(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10).sample( diff --git a/integration_tests/src/main/python/schema_evolution_test.py b/integration_tests/src/main/python/schema_evolution_test.py index 4138bb11e86..d9f4c0f0899 100644 --- a/integration_tests/src/main/python/schema_evolution_test.py +++ b/integration_tests/src/main/python/schema_evolution_test.py @@ -13,6 +13,7 @@ # limitations under the License. from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import ignore_order @@ -61,6 +62,7 @@ def get_ddl(col_gen_pairs): @ignore_order(local=True) @pytest.mark.parametrize("format", _formats) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_column_add_after_partition(spark_tmp_table_factory, format): # Databricks 10.4 appears to be missing https://issues.apache.org/jira/browse/SPARK-39417 # so avoid generating nulls for numeric partitions diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py index f3a73066af5..7fe208ae12d 100644 --- a/integration_tests/src/main/python/sort_test.py +++ b/integration_tests/src/main/python/sort_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu from pyspark.sql.types import * @@ -51,6 +52,7 @@ def test_sort_nonbinary_carry_binary(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -58,6 +60,7 @@ def test_single_orderby(data_gen, order): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_first(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first(), f.col('a').desc_nulls_last()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby_on_array(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -102,6 +105,7 @@ def test_single_orderby_fallback_for_array_of_struct(data_gen, order): marks=pytest.mark.xfail(reason='opposite null order not supported')), pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_plain(data_gen, order, shuffle_parts, stable_sort): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order), @@ -129,6 +133,7 @@ def test_single_nested_orderby_fallback_for_nullorder(data_gen, order): orderable_without_neg_decimal = [n for n in (orderable_gens + orderable_not_null_gen) if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_without_neg_decimal + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100)) @@ -139,6 +144,7 @@ def test_single_orderby_with_limit(data_gen, order): pytest.param(f.col('a').desc(), all_basic_struct_gen), pytest.param(f.col('a').desc_nulls_last(), all_basic_struct_gen) ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100), @@ -161,6 +167,7 @@ def test_single_nested_orderby_with_limit_fallback(data_gen, order): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_sort_in_part(data_gen, order): # We set `num_slices` to handle https://github.com/NVIDIA/spark-rapids/issues/2477 assert_gpu_and_cpu_are_equal_collect( @@ -183,6 +190,7 @@ def test_single_sort_in_part(data_gen, order): pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_sort_in_part(data_gen, order, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} assert_gpu_and_cpu_are_equal_collect( @@ -193,11 +201,13 @@ def test_single_nested_sort_in_part(data_gen, order, stable_sort): boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)]) ] + orderable_decimal_gens + single_level_array_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_on_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @@ -205,6 +215,7 @@ def test_multi_orderby_on_array(data_gen): # SPARK CPU itself has issue with negative scale for take ordered and project orderable_gens_sort_without_neg_decimal = [n for n in orderable_gens_sort if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_with_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -212,6 +223,7 @@ def test_multi_orderby_with_limit(data_gen): # We added in a partitioning optimization to take_ordered_and_project # This should trigger it. @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_with_limit_single_part(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -256,6 +268,7 @@ def test_single_orderby_with_skew(data_gen): # We are not trying all possibilities, just doing a few with numbers so the query works. @pytest.mark.parametrize('data_gen', [all_basic_struct_gen, StructGen([['child0', all_basic_struct_gen]])], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_with_skew(data_gen, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} # When doing range partitioning the upstream data is sampled to try and get the bounds for cutoffs. @@ -299,6 +312,7 @@ def test_large_orderby(data_gen, stable_sort): simple_string_to_string_map_gen, ArrayGen(byte_gen, max_length=5)], ids=idfn) @pytest.mark.order(2) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_large_orderby_nested_ridealong(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. @@ -319,6 +333,7 @@ def test_large_orderby_nested_ridealong(data_gen): ArrayGen(byte_gen, max_length=5), ArrayGen(decimal_gen_128bit, max_length=5)], ids=idfn) @pytest.mark.order(2) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orderby_nested_ridealong_limit(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. diff --git a/integration_tests/src/main/python/struct_test.py b/integration_tests/src/main/python/struct_test.py index 0e230a95408..986781c32e0 100644 --- a/integration_tests/src/main/python/struct_test.py +++ b/integration_tests/src/main/python/struct_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * @@ -33,6 +34,7 @@ def test_struct_scalar_project(): StructGen([["first", decimal_gen_64bit], ["second", decimal_gen_32bit], ["third", decimal_gen_32bit]]), StructGen([["first", decimal_gen_128bit], ["second", decimal_gen_128bit], ["third", decimal_gen_128bit]]), StructGen([["first", binary_gen], ["second", ArrayGen(BinaryGen(max_length=10), max_length=10)], ["third", binary_gen]])], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_struct_get_item(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -43,6 +45,7 @@ def test_struct_get_item(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + [binary_gen, null_gen] + single_level_array_gens + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_struct(data_gen): # Spark has no good way to create a map literal without the map function # so we are inserting one. diff --git a/integration_tests/src/main/python/subquery_test.py b/integration_tests/src/main/python/subquery_test.py index e6d641d4212..25a70b47a17 100644 --- a/integration_tests/src/main/python/subquery_test.py +++ b/integration_tests/src/main/python/subquery_test.py @@ -14,11 +14,13 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from marks import * @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_basics(data_gen): # Fix num_slices at 1 to make sure that first/last returns same results under CPU and GPU. assert_gpu_and_cpu_are_equal_sql( @@ -31,6 +33,7 @@ def test_scalar_subquery_basics(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_struct(basic_gen): # single-level struct gen = [('ss', StructGen([['a', basic_gen], ['b', basic_gen]]))] @@ -65,6 +68,7 @@ def test_scalar_subquery_struct(basic_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_array(basic_gen): # single-level array assert_gpu_and_cpu_are_equal_sql( diff --git a/integration_tests/src/main/python/time_window_test.py b/integration_tests/src/main/python/time_window_test.py index ff367b506fb..52071926309 100644 --- a/integration_tests/src/main/python/time_window_test.py +++ b/integration_tests/src/main/python/time_window_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from datetime import datetime from marks import ignore_order, allow_non_gpu @@ -29,6 +30,7 @@ @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -40,6 +42,7 @@ def test_grouped_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -47,6 +50,7 @@ def test_grouped_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window_array(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', ArrayGen(data_gen)]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -54,6 +58,7 @@ def test_grouped_sliding_window_array(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour')) @@ -62,6 +67,7 @@ def test_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sliding_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour', '1 hour')) @@ -72,6 +78,7 @@ def test_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample, ids=idfn) # This includes an expand and we produce a different order than the CPU does. Sort locally to allow sorting of all types @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_just_window(data_gen): row_gen = StructGen([['ts', timestamp_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index db8425f6387..88281279162 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -14,7 +14,7 @@ import pytest -from conftest import is_at_least_precommit_run +from conftest import is_at_least_precommit_run, is_not_utc from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version @@ -90,6 +90,7 @@ def iterator_add(to_process: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[ @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_scalar_udf_nested_type(data_gen): def nested_size(nested): return pd.Series([nested.size]).repeat(len(nested)) @@ -116,6 +117,7 @@ def pandas_sum(to_process: pd.Series) -> float: @approximate_float @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_aggregate_udf_more_types(data_gen): @f.pandas_udf('double') def group_size_udf(to_process: pd.Series) -> float: @@ -146,6 +148,7 @@ def pandas_sum(to_process: pd.Series) -> int: @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_aggregate_udf_more_types(data_gen): @f.pandas_udf('long') def group_size_udf(to_process: pd.Series) -> int: @@ -261,6 +264,7 @@ def pandas_add(data): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_apply_udf_more_types(data_gen): def group_size_udf(key, pdf): return pd.DataFrame([[len(key), len(pdf), len(pdf.columns)]]) @@ -288,6 +292,7 @@ def pandas_filter(iterator): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_map_udf_nested_type(data_gen): # Supported UDF output types by plugin: (commonCudfTypes + ARRAY).nested() + STRUCT # STRUCT represents the whole dataframe in Map Pandas UDF, so no struct column in UDF output. diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index e01c68ed35c..d850403d118 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_sql_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -450,6 +451,7 @@ def test_range_windows_with_string_order_by_column(data_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded_no_part(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -467,6 +469,7 @@ def test_window_batched_unbounded_no_part(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -487,6 +490,7 @@ def test_window_batched_unbounded(b_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_rows_based_running_window_unpartitioned(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -522,6 +526,7 @@ def test_rows_based_running_window_unpartitioned(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Testing multiple batch sizes. @pytest.mark.parametrize('a_gen', integral_gens + [string_gen, date_gen, timestamp_gen], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_running_window_without_partitions_runs_batched(a_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -645,6 +650,7 @@ def test_running_window_float_sum_without_partitions_runs_batched(batch_size): @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit, orderable_decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_running_rank_no_part(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -672,6 +678,7 @@ def test_window_running_rank_no_part(data_gen): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_running_rank(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -699,6 +706,7 @@ def test_window_running_rank(data_gen): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen, c_gen', [(long_gen, x) for x in running_part_and_order_gens] + [(x, long_gen) for x in all_basic_gens + [decimal_gen_32bit]], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.variableFloatAgg.enabled': True, @@ -738,6 +746,7 @@ def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Test different batch sizes. @pytest.mark.parametrize('part_gen', [int_gen, long_gen], ids=idfn) # Partitioning is not really the focus of the test. @pytest.mark.parametrize('order_gen', [x for x in all_basic_gens_no_null if x not in boolean_gens] + [decimal_gen_32bit], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_range_running_window_runs_batched(part_gen, order_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -881,6 +890,7 @@ def window(oby_column): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('c_gen', lead_lag_data_gens, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_types_window_aggs_for_rows_lead_lag(a_b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size} data_gen = [ @@ -938,6 +948,7 @@ def do_it(spark): @approximate_float @pytest.mark.parametrize('struct_gen', lead_lag_struct_with_arrays_gen, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lead_lag_for_structs_with_arrays(a_b_gen, struct_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -971,6 +982,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1000,6 +1012,7 @@ def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): @approximate_float @pytest.mark.parametrize('c_gen', [string_gen], ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_types_window_aggs_for_rows(a_b_gen, c_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -1057,6 +1070,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1081,6 +1095,7 @@ def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1105,6 +1120,7 @@ def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): @pytest.mark.parametrize('data_gen', [_grpkey_longs_with_timestamps, pytest.param(_grpkey_longs_with_nullable_timestamps)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_ranges_timestamps(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, data_gen, length=2048), @@ -1252,6 +1268,7 @@ def test_window_aggregations_for_big_decimal_ranges(data_gen): # SortExec does not support array type, so sort the result locally. @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_list(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1298,6 +1315,7 @@ def test_window_aggs_for_rows_collect_list(): @ignore_order(local=True) # This test is more directed at Databricks and their running window optimization instead of ours # this is why we do not validate that we inserted in a GpuRunningWindowExec, yet. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_running_window_function_exec_for_all_aggs(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1406,6 +1424,7 @@ def do_it(spark): # SortExec does not support array type, so sort the result locally. @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_set(): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, _gen_data_for_collect_set), @@ -1467,6 +1486,7 @@ def test_window_aggs_for_rows_collect_set(): # and https://github.com/rapidsai/cudf/issues/11222 @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_set_nested_array(): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -1579,6 +1599,7 @@ def do_it(spark): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('ride_along', all_basic_gens + decimal_gens + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_ride_along(ride_along): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, [('a', UniqueLongGen()), ('b', ride_along)]), @@ -1654,6 +1675,7 @@ def test_unbounded_to_unbounded_window(): 'last(a) IGNORE NULLS OVER (PARTITION BY b ORDER BY c) ' @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_first_last_nth(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic @@ -1664,6 +1686,7 @@ def test_window_first_last_nth(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='IGNORE NULLS clause is not supported for FIRST(), LAST() and NTH_VALUE in Spark 3.1.x') @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_first_last_nth_ignore_nulls(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic @@ -1674,6 +1697,7 @@ def test_window_first_last_nth_ignore_nulls(data_gen): @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_date_with_window_functions(): """ This test ensures that date expressions participating alongside window aggregations diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index a13b5137af0..c9b1369807c 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -155,6 +155,12 @@ ci_2() { # Download a Scala 2.12 build of spark prepare_spark $SPARK_VER 2.12 ./integration_tests/run_pyspark_from_build.sh + + # Test a portion of cases for non-UTC time zone because of limited GPU resources. + # Here testing: parquet scan, orc scan, csv scan, cast, TimeZoneAwareExpression, FromUTCTimestamp + # Nightly CIs will cover all the cases. + TZ=Iran TEST='test_parquet_read_round_trip or test_read_round_trip or test_basic_csv_read or test_cast_string_ts_valid_format or test_unix_timestamp or test_from_utc_timestamp' ./integration_tests/run_pyspark_from_build.sh + # enable avro test separately INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled