xfail all the impacted cases when using non-UTC time zone [databricks] (#9773)

* re-organize imports

Signed-off-by: Chong Gao <[email protected]>

* xfail all the cases when the time zone is non-UTC

* Temporarily testing non-UTC test cases because the non-UTC TZ pipeline is not ready

* Xfail more cases that involve timestamp type

* Xfail Databricks cases because its default rebase mode is legacy

* Xfail Databricks cases because its default rebase mode is legacy

* Xfail Databricks cases because its default rebase mode is legacy

* Xfail failed cases for Spark 341

* Revert "Xfail Databricks cases because its default rebase mode is legacy"

This reverts commit 34cbc5d.

* Revert "Xfail Databricks cases because its default rebase mode is legacy"

This reverts commit 57a1476.

* Revert "Xfail Databricks cases because its default rebase mode is legacy"

This reverts commit c2b5ffb.

* Revert "Xfail more cases that involve timestamp type"

This reverts commit 3f8bc40.

* Revert "Temporarily testing non-UTC test cases becasue of non-UTC TZ pipeline is not ready"

This reverts commit 9530b23.

* Temporarily testing non-UTC test cases

* Temporarily testing non-UTC test cases

* Fix

* Restore TEST_PARALLEL from 1 to 5 because running with TEST_PARALLEL=1 is slow

* Add one more file for non-UTC time zone

* Fix import error

* Test a portion of cases for non-UTC time zone in pre-merge

---------

Signed-off-by: Chong Gao <[email protected]>
Co-authored-by: Chong Gao <[email protected]>
res-life and Chong Gao authored Nov 27, 2023
1 parent d13f170 commit 6aef5c2
Showing 44 changed files with 441 additions and 38 deletions.
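
Every touched test file gets the same two-line treatment: import `is_not_utc` from `conftest`, then gate each affected test with a conditional `xfail` marker pointing at issue #9653. Below is a minimal sketch of what that helper might look like (hypothetical; the real implementation lives in integration_tests/src/main/python/conftest.py and may differ), assuming the test session's time zone is taken from the TZ environment variable with a UTC default:

    # Hypothetical sketch of the conftest helper; the names and the
    # TZ-environment-variable assumption are illustrative, not confirmed
    # by this diff.
    import os

    def get_test_tz():
        # Assume the integration-test session time zone comes from the TZ
        # environment variable, defaulting to UTC.
        return os.environ.get('TZ', 'UTC')

    def is_not_utc():
        # True when the test session runs under any zone other than UTC.
        return get_test_tz() != 'UTC'
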
2 changes: 2 additions & 0 deletions integration_tests/src/main/python/aqe_test.py
@@ -16,6 +16,7 @@
from pyspark.sql.functions import when, col, current_date, current_timestamp
from pyspark.sql.types import *
from asserts import assert_gpu_and_cpu_are_equal_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import is_not_utc
from data_gen import *
from marks import ignore_order, allow_non_gpu
from spark_session import with_cpu_session, is_databricks113_or_later
@@ -195,6 +196,7 @@ def do_it(spark):
@ignore_order(local=True)
@allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow)
@pytest.mark.parametrize('join', joins, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join):
data_path = spark_tmp_path + '/PARQUET_DATA'
def prep(spark):
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/arithmetic_ops_test.py
@@ -16,6 +16,7 @@
import pytest

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql
from conftest import is_not_utc
from data_gen import *
from marks import ignore_order, incompat, approximate_float, allow_non_gpu, datagen_overrides
from pyspark.sql.types import *
@@ -987,6 +988,7 @@ def test_columnar_pow(data_gen):
lambda spark : binary_op_df(spark, data_gen).selectExpr('pow(a, b)'))

@pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_least(data_gen):
num_cols = 20
s1 = with_cpu_session(
@@ -1003,6 +1005,7 @@ def test_least(data_gen):
f.least(*command_args)))

@pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_greatest(data_gen):
num_cols = 20
s1 = with_cpu_session(
25 changes: 24 additions & 1 deletion integration_tests/src/main/python/array_test.py
@@ -16,7 +16,7 @@

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
from data_gen import *
from conftest import is_databricks_runtime
from conftest import is_databricks_runtime, is_not_utc
from marks import incompat
from spark_session import is_before_spark_313, is_before_spark_330, is_databricks113_or_later, is_spark_330_or_later, is_databricks104_or_later, is_spark_33X, is_spark_340_or_later, is_spark_330, is_spark_330cdh
from pyspark.sql.types import *
@@ -103,11 +103,13 @@

@pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn)
@pytest.mark.parametrize('index_gen', array_index_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_item(data_gen, index_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: two_col_df(spark, data_gen, index_gen).selectExpr('a[b]'))

@pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_item_lit_ordinal(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -145,6 +147,7 @@ def test_array_item_with_strict_index(strict_index_enabled, index):

# No need to test this for multiple data types for array. Only one is enough, but with two kinds of invalid index.
@pytest.mark.parametrize('index', [-2, 100, array_neg_index_gen, array_out_index_gen], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_item_ansi_fail_invalid_index(index):
message = "SparkArrayIndexOutOfBoundsException" if (is_databricks104_or_later() or is_spark_330_or_later()) else "java.lang.ArrayIndexOutOfBoundsException"
if isinstance(index, int):
@@ -171,6 +174,7 @@ def test_array_item_ansi_not_fail_all_null_data():
decimal_gen_32bit, decimal_gen_64bit, decimal_gen_128bit, binary_gen,
StructGen([['child0', StructGen([['child01', IntegerGen()]])], ['child1', string_gen], ['child2', float_gen]], nullable=False),
StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]], nullable=False)], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_make_array(data_gen):
(s1, s2) = with_cpu_session(
lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen)))
@@ -183,6 +187,7 @@ def test_make_array(data_gen):


@pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_orderby_array_unique(data_gen):
assert_gpu_and_cpu_are_equal_sql(
lambda spark : append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)),
@@ -212,6 +217,7 @@ def test_orderby_array_of_structs(data_gen):
@pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen,
float_gen, double_gen,
string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_contains(data_gen):
arr_gen = ArrayGen(data_gen)
literal = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True))
@@ -239,6 +245,7 @@ def test_array_contains_for_nans(data_gen):


@pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_element_at(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: two_col_df(spark, data_gen, array_no_zero_index_gen).selectExpr(
@@ -303,6 +310,7 @@ def test_array_element_at_zero_index_fail(index, ansi_enabled):


@pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_transform(data_gen):
def do_it(spark):
columns = ['a', 'b',
@@ -337,6 +345,7 @@ def do_it(spark):
string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + decimal_gens

@pytest.mark.parametrize('data_gen', array_min_max_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_min_max(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, ArrayGen(data_gen)).selectExpr(
@@ -361,6 +370,7 @@ def test_array_concat_decimal(data_gen):
'concat(a, a)')))

@pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_repeat_with_count_column(data_gen):
cnt_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[])
cnt_not_null_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[], nullable=False)
@@ -374,6 +384,7 @@ def test_array_repeat_with_count_column(data_gen):


@pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_repeat_with_count_scalar(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
@@ -403,6 +414,7 @@ def test_sql_array_scalars(query):


@pytest.mark.parametrize('data_gen', all_basic_gens + nested_gens_sample, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_get_array_struct_fields(data_gen):
array_struct_gen = ArrayGen(
StructGen([['child0', data_gen], ['child1', int_gen]]),
@@ -441,6 +453,7 @@ def do_it(spark):


@pytest.mark.parametrize('data_gen', array_zips_gen, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_arrays_zip(data_gen):
gen = StructGen(
[('a', data_gen), ('b', data_gen), ('c', data_gen), ('d', data_gen)], nullable=False)
@@ -473,6 +486,7 @@ def q1(spark):

@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
@pytest.mark.skipif(is_before_spark_313() or is_spark_330() or is_spark_330cdh(), reason="NaN equality is only handled in Spark 3.1.3+ and SPARK-39976 issue with null and ArrayIntersect in Spark 3.3.0")
def test_array_intersect(data_gen):
gen = StructGen(
@@ -514,6 +528,7 @@ def test_array_intersect_spark330(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn)
@pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_intersect_before_spark313(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -534,6 +549,7 @@ def test_array_intersect_before_spark313(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn)
@pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_union(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -554,6 +570,7 @@ def test_array_union(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn)
@pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_union_before_spark313(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -574,6 +591,7 @@ def test_array_union_before_spark313(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn)
@pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_except(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -594,6 +612,7 @@ def test_array_except(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn)
@pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_except_before_spark313(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -614,6 +633,7 @@ def test_array_except_before_spark313(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn)
@pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_arrays_overlap(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -635,6 +655,7 @@ def test_arrays_overlap(data_gen):
@incompat
@pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn)
@pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+")
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_arrays_overlap_before_spark313(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -672,6 +693,7 @@ def test_array_remove_scalar(data_gen):
FloatGen(special_cases=_non_neg_zero_float_special_cases + [-0.0]),
DoubleGen(special_cases=_non_neg_zero_double_special_cases + [-0.0]),
StringGen(pattern='[0-9]{1,5}'), boolean_gen, date_gen, timestamp_gen] + decimal_gens, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_array_remove(data_gen):
gen = StructGen(
[('a', ArrayGen(data_gen, nullable=True)),
@@ -686,6 +708,7 @@ def test_array_remove(data_gen):


@pytest.mark.parametrize('data_gen', [ArrayGen(sub_gen) for sub_gen in array_gens_sample], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_flatten_array(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, data_gen).selectExpr('flatten(a)')
9 changes: 9 additions & 0 deletions integration_tests/src/main/python/ast_test.py
@@ -15,6 +15,7 @@
import pytest

from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import is_not_utc
from data_gen import *
from marks import approximate_float, datagen_overrides
from spark_session import with_cpu_session, is_before_spark_330
@@ -70,6 +71,7 @@ def assert_binary_ast(data_descr, func, conf={}):
assert_gpu_ast(is_supported, lambda spark: func(binary_op_df(spark, data_gen)), conf=conf)

@pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_literal(spark_tmp_path, data_gen):
# Write data to Parquet so Spark generates a plan using just the count of the data.
data_path = spark_tmp_path + '/AST_TEST_DATA'
@@ -79,6 +81,7 @@ def test_literal(spark_tmp_path, data_gen):
func=lambda spark: spark.read.parquet(data_path).select(scalar))

@pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_null_literal(spark_tmp_path, data_gen):
# Write data to Parquet so Spark generates a plan using just the count of the data.
data_path = spark_tmp_path + '/AST_TEST_DATA'
@@ -232,6 +235,7 @@ def test_expm1(data_descr):
assert_unary_ast(data_descr, lambda df: df.selectExpr('expm1(a)'))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_eq(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
@@ -241,6 +245,7 @@ def test_eq(data_descr):
f.col('a') == f.col('b')))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_ne(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
@@ -250,6 +255,7 @@ def test_ne(data_descr):
f.col('a') != f.col('b')))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_lt(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
@@ -259,6 +265,7 @@ def test_lt(data_descr):
f.col('a') < f.col('b')))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_lte(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
@@ -268,6 +275,7 @@ def test_lte(data_descr):
f.col('a') <= f.col('b')))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_gt(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
@@ -277,6 +285,7 @@ def test_gt(data_descr):
f.col('a') > f.col('b')))

@pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn)
@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_gte(data_descr):
(s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2))
assert_binary_ast(data_descr,
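
One usage note on the recurring marker: pytest evaluates the `condition = is_not_utc()` argument once, when each test module is imported during collection, so the non-UTC time zone must already be in effect when the pytest session starts for these xfails to apply. A self-contained sketch of the pattern (the helper body, test name, and zone are hypothetical stand-ins):

    import os
    import pytest

    def is_not_utc():
        # Stand-in for the conftest helper used throughout this commit.
        return os.environ.get('TZ', 'UTC') != 'UTC'

    # The condition is computed here, at import time: the test is collected
    # as xfail only if TZ was already non-UTC (e.g. TZ=Asia/Shanghai) when
    # pytest launched.
    @pytest.mark.xfail(condition=is_not_utc(),
                       reason='https://github.com/NVIDIA/spark-rapids/issues/9653')
    def test_timestamp_example():
        assert True  # placeholder; real tests compare CPU and GPU results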
