From 2f9cdd66e447cd46a62a1364692b7e4b88453ff3 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 4 Dec 2024 17:39:01 -0800 Subject: [PATCH 1/2] Make serialization key checks uniform for Hive Text file read/write. This is a minor change to bring the serialization format checks on the read and write sides of Hive text files in line with each other. The read side falls back to empty ("") when the serialization key is not set: https://github.com/NVIDIA/spark-rapids/blob/aa2da410511d8a737e207257769ec662a79174fe/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/HiveProviderImpl.scala#L155-L161 The write side was falling back to the expected default, "1", when the key is not set: https://github.com/NVIDIA/spark-rapids/blob/aa2da410511d8a737e207257769ec662a79174fe/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala#L130-L136 This inconsistency caused confusing behaviour, as reported in https://github.com/NVIDIA/spark-rapids/issues/11803. The change in this commit makes the checks uniformly conservative: on both sides, a missing serialization key now defaults to empty ("") instead of being assumed to be the expected default. 
Signed-off-by: MithunR --- .../org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala index 3b5244e5c79..efcad625e34 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala @@ -127,7 +127,7 @@ object GpuHiveFileFormat extends Logging { s"only $lazySimpleSerDe is currently supported for text") } - val serializationFormat = storage.properties.getOrElse(serializationKey, "1") + val serializationFormat = storage.properties.getOrElse(serializationKey, "") if (serializationFormat != ctrlASeparatedFormat) { meta.willNotWorkOnGpu(s"unsupported serialization format found: " + s"$serializationFormat, " + From c8c8f9d71d8b1c86cc5da15ecc90919d42640110 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 6 Dec 2024 15:27:18 -0800 Subject: [PATCH 2/2] xfail for 3.2. 
--- .../src/main/python/hive_delimited_text_test.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/hive_delimited_text_test.py b/integration_tests/src/main/python/hive_delimited_text_test.py index 6821f4b3cb6..704ce07bd6e 100644 --- a/integration_tests/src/main/python/hive_delimited_text_test.py +++ b/integration_tests/src/main/python/hive_delimited_text_test.py @@ -18,7 +18,7 @@ from enum import Enum from marks import * from pyspark.sql.types import * -from spark_session import is_spark_cdh, with_cpu_session +from spark_session import is_before_spark_330, is_spark_cdh, with_cpu_session hive_text_enabled_conf = {"spark.rapids.sql.format.hive.text.enabled": True, "spark.rapids.sql.format.hive.text.read.enabled": True} @@ -522,7 +522,12 @@ def create_table_with_compressed_files(spark): "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float @ignore_order(local=True) -@pytest.mark.parametrize('mode', [TableWriteMode.CTAS, TableWriteMode.CreateThenWrite]) +@pytest.mark.parametrize('mode', [pytest.param(TableWriteMode.CTAS, + marks=pytest.mark.xfail(condition=is_before_spark_330(), + reason="Spark 3.2.x seems not to set a table's " + "serialization.format to 1, by default. " + "Other Spark versions seem fine.")), + TableWriteMode.CreateThenWrite]) @pytest.mark.parametrize('input_dir,schema,options', [ ('hive-delim-text/simple-boolean-values', make_schema(BooleanType()), {}), ('hive-delim-text/simple-int-values', make_schema(ByteType()), {}),