Skip to content

Commit

Permalink
Address missed review comments
Browse files Browse the repository at this point in the history
Signed-off-by: Kuhu Shukla <[email protected]>
  • Loading branch information
kuhushukla committed Dec 6, 2024
1 parent 0ced895 commit 2e2202b
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
@allow_non_gpu(*non_utc_allow)
def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format):
num_rows = 2048
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
def gen_table(spark):
Expand Down
4 changes: 2 additions & 2 deletions integration_tests/src/main/python/hive_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def _restricted_timestamp(nullable=True):
end=datetime(2262, 4, 11, tzinfo=timezone.utc),
nullable=nullable)

# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand All @@ -48,7 +48,7 @@ def _restricted_timestamp(nullable=True):
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]

# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [
Expand Down
17 changes: 10 additions & 7 deletions integration_tests/src/main/python/orc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl,
#E at org.apache.orc.TypeDescription.parseInt(TypeDescription.java:244)
#E at org.apache.orc.TypeDescription.parseType(TypeDescription.java:362)
# ...
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
orc_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand Down Expand Up @@ -204,7 +204,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e
read_func(data_path),
conf=all_confs)

# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
orc_pred_push_gens = [
Expand Down Expand Up @@ -283,7 +283,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea
def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs):
# Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
# we should go with a more standard set of generators
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand Down Expand Up @@ -353,7 +353,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade
def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs):
# Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
# we should go with a more standard set of generators
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand Down Expand Up @@ -837,7 +837,7 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_
@pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))])
@allow_non_gpu(*non_utc_allow_orc_scan)
def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order):
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand Down Expand Up @@ -942,6 +942,9 @@ def test_orc_column_name_with_dots(spark_tmp_path, reader_confs):
("f.g", int_gen),
("h", string_gen)])),
("i.j", long_gen)])),
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen for column k
("k", int_gen)]
with_cpu_session(lambda spark: gen_df(spark, gens).write.orc(data_path))
assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs)
Expand All @@ -960,7 +963,7 @@ def test_orc_with_null_column(spark_tmp_path, reader_confs):
def gen_null_df(spark):
return spark.createDataFrame(
[(None, None, None, None, None)],
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
"c1 int, c2 long, c3 float, c4 double, c5 int")
Expand All @@ -984,7 +987,7 @@ def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs):
def gen_null_df(spark):
return spark.createDataFrame(
data,
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
"c1 int, c2 long, c3 float, c4 double, c5 int")
Expand Down
7 changes: 5 additions & 2 deletions integration_tests/src/main/python/orc_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from pyspark.sql.types import *

pytestmark = pytest.mark.nightly_resource_consuming_test
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen.
orc_write_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
Expand Down Expand Up @@ -65,7 +65,7 @@
ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10),
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen.
orc_write_basic_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [
Expand Down Expand Up @@ -377,6 +377,9 @@ def test_orc_write_column_name_with_dots(spark_tmp_path):
("f.g", int_gen),
("h", string_gen)])),
("i.j", long_gen)])),
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen for column k
("k", int_gen)]
assert_gpu_and_cpu_writes_are_equal_collect(
lambda spark, path: gen_df(spark, gens).coalesce(1).write.orc(path),
Expand Down
4 changes: 2 additions & 2 deletions integration_tests/src/main/python/schema_evolution_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

# List of additional column data generators to use when adding columns
_additional_gens = [
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, add back boolean_gen
byte_gen,
Expand All @@ -51,7 +51,7 @@
# simple_string_to_string_map_gen),
ArrayGen(_custom_date_gen),
struct_gen_decimal128,
# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
# https://github.com/rapidsai/cudf/issues/6763 .
# Once the first issue is fixed, replace int_gen with boolean_gen for c1
StructGen([("c0", ArrayGen(long_gen)), ("c1", int_gen)]),
Expand Down

0 comments on commit 2e2202b

Please sign in to comment.