From e3d6bff75fe720890cd8b467d5ae6e97fab7a7b2 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 29 Jul 2024 14:57:52 -0700 Subject: [PATCH 01/40] Add a new interface to retrieve all configs with their defaults; Add a new stage for integration test to populate default configs Signed-off-by: Jihoon Son --- integration_tests/pom.xml | 30 +++++++ integration_tests/run_pyspark_from_build.sh | 1 + .../src/main/python/spark_session.py | 26 ++---- .../rapids/tests/DumpDefaultConfigs.scala | 83 +++++++++++++++++++ pom.xml | 2 +- scala2.13/integration_tests/pom.xml | 30 +++++++ .../com/nvidia/spark/rapids/RapidsConf.scala | 20 +++++ 7 files changed, 173 insertions(+), 19 deletions(-) create mode 100644 integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 75bd244f31d..aabeb45187f 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -180,6 +180,36 @@ org.codehaus.mojo exec-maven-plugin + + clean previously populated config file + verify + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate default configs for testing + verify + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + run pyspark tests verify diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 22a23349791..b3f223d7a46 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -15,6 +15,7 @@ set -ex SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +export SPARK_RAPIDS_DEFAULT_CONFIGS_PATH=${SCRIPTPATH}/target/spark-rapids-default-configs.json cd "$SCRIPTPATH" if [[ $( echo ${SKIP_TESTS} | tr [:upper:] [:lower:] ) == "true" ]]; diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index 26388617fff..8356b49f48c 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -21,6 +21,7 @@ from pyspark.sql.types import TimestampType, DateType, _acceptable_types from spark_init_internal import get_spark_i_know_what_i_am_doing, spark_version from unittest.mock import patch +import json def _from_scala_map(scala_map): ret = {} @@ -40,23 +41,9 @@ def _from_scala_map(scala_map): # These settings can be overridden by specific tests if necessary. # Many of these are redundant with default settings for the configs but are set here explicitly # to ensure any cluster settings do not interfere with tests that assume the defaults. 
-_default_conf = { - 'spark.rapids.sql.castDecimalToFloat.enabled': 'false', - 'spark.rapids.sql.castFloatToDecimal.enabled': 'false', - 'spark.rapids.sql.castFloatToIntegralTypes.enabled': 'false', - 'spark.rapids.sql.castFloatToString.enabled': 'false', - 'spark.rapids.sql.castStringToFloat.enabled': 'false', - 'spark.rapids.sql.castStringToTimestamp.enabled': 'false', - 'spark.rapids.sql.fast.sample': 'false', - 'spark.rapids.sql.hasExtendedYearValues': 'true', - 'spark.rapids.sql.hashOptimizeSort.enabled': 'false', - 'spark.rapids.sql.improvedFloatOps.enabled': 'false', - 'spark.rapids.sql.incompatibleDateFormats.enabled': 'false', - 'spark.rapids.sql.incompatibleOps.enabled': 'false', - 'spark.rapids.sql.mode': 'executeongpu', - 'spark.rapids.sql.variableFloatAgg.enabled': 'false', - 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true', -} +conf_file_path = os.environ['SPARK_RAPIDS_DEFAULT_CONFIGS_PATH'] +with open(conf_file_path) as conf_file: + _default_conf = json.load(conf_file) def _set_all_confs(conf): newconf = _default_conf.copy() @@ -67,7 +54,10 @@ def _set_all_confs(conf): newconf.update(conf) for key, value in newconf.items(): if _spark.conf.get(key, None) != value: - _spark.conf.set(key, value) + if isinstance(value, str): + _spark.conf.set(key, value) + else: + _spark.conf.set(key, str(value)) def reset_spark_session_conf(): """Reset all of the configs for a given spark session.""" diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala new file mode 100644 index 00000000000..57f77247216 --- /dev/null +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tests + +import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} +import java.util.Locale + +import com.nvidia.spark.rapids.Arm.withResource +import com.nvidia.spark.rapids.ShimReflectionUtils + +/** + * Dump all spark-rapids configs with their defaults. + */ +object DumpDefaultConfigs { + + object Format extends Enumeration { + type Format = Value + val PLAIN, JSON = Value + } + + def main(args: Array[String]): Unit = { + if (args.length < 2) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName} {format} {output_path}") + System.exit(1) + } + + val format = Format.withName(args(0).toUpperCase(Locale.US)) + val outputPath = args(1) + + println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") + + // We use the reflection as RapidsConf should be accessed via the shim layer. 
+ val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") + val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") + val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] + val fos: FileOutputStream = new FileOutputStream(outputPath) + withResource(fos) { _ => + val bos: BufferedOutputStream = new BufferedOutputStream(fos) + withResource(bos) { _ => + format match { + case Format.PLAIN => + val dos: DataOutputStream = new DataOutputStream(bos) + withResource(dos) { _ => + allConfs.foreach({ case (k, v) => + val valStr = v match { + case some: Some[_] => some.getOrElse("") + case _ => + if (v == null) { + "" + } else { + v.toString + } + } + dos.writeUTF(s"'${k}': '${valStr}',") + }) + } + case Format.JSON => + import org.json4s.jackson.Serialization.writePretty + import org.json4s.DefaultFormats + import java.nio.charset.StandardCharsets + implicit val formats = DefaultFormats + bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) + case _ => + System.err.println(s"Unknown format: ${format}") + } + } + } + } +} diff --git a/pom.xml b/pom.xml index 91b116c46a9..556b04e92cf 100644 --- a/pom.xml +++ b/pom.xml @@ -1439,7 +1439,7 @@ This will force full Scala code rebuild in downstream modules. org.codehaus.mojo exec-maven-plugin - 3.0.0 + 3.3.0 org.apache.maven.plugins diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index f7e443d5a22..2ee846d0e70 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -180,6 +180,36 @@ org.codehaus.mojo exec-maven-plugin + + clean previously populated config file + verify + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate default configs for testing + verify + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + run pyspark tests verify diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 1cb03958111..5440b8dbfa7 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -124,6 +124,7 @@ abstract class ConfEntry[T](val key: String, val converter: String => T, val doc def get(conf: Map[String, String]): T def get(conf: SQLConf): T + def getDefault(): T def help(asTable: Boolean = false): Unit override def toString: String = key @@ -147,6 +148,10 @@ class ConfEntryWithDefault[T](key: String, converter: String => T, doc: String, } } + override def getDefault(): T = { + defaultValue + } + override def help(asTable: Boolean = false): Unit = { if (!isInternal) { val startupOnlyStr = if (isStartupOnly) "Startup" else "Runtime" @@ -182,6 +187,10 @@ class OptionalConfEntry[T](key: String, val rawConverter: String => T, doc: Stri } } + override def getDefault(): Option[T] = { + None + } + override def help(asTable: Boolean = false): Unit = { if (!isInternal) { val startupOnlyStr = if (isStartupOnly) "Startup" else "Runtime" @@ -2383,6 +2392,17 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. println("-----|-----------------|-------------|---------------|------") } + /** + * Returns all spark-rapids configs with their default values. 
+ * This function is used to dump default configs, so that they + * could be used by the integration test. + */ + def getAllConfigsWithDefault: Map[String, Any] = { + val allConfs = registeredConfs.clone() + allConfs.append(RapidsPrivateUtil.getPrivateConfigs(): _*) + allConfs.map(e => e.key -> e.getDefault).toMap + } + def help(asTable: Boolean = false): Unit = { helpCommon(asTable) helpAdvanced(asTable) From e7f1ec5ac429ced373a0c0fc2169062746d3ee72 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 11:00:46 -0700 Subject: [PATCH 02/40] address comments --- integration_tests/pom.xml | 6 +++--- .../spark/rapids/tests/DumpDefaultConfigs.scala | 17 +++++++---------- scala2.13/integration_tests/pom.xml | 6 +++--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index aabeb45187f..d9a1b504247 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -181,7 +181,7 @@ exec-maven-plugin - clean previously populated config file + clean-previously-populated-config-file verify exec @@ -195,7 +195,7 @@ - populate default configs for testing + populate-default-configs-for-testing verify java @@ -211,7 +211,7 @@ - run pyspark tests + run-pyspark-tests verify exec diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 57f77247216..0b9179471a2 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -17,10 +17,13 @@ package com.nvidia.spark.rapids.tests import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} +import java.nio.charset.StandardCharsets import java.util.Locale import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.ShimReflectionUtils +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization.writePretty /** * Dump all spark-rapids configs with their defaults. 
@@ -47,15 +50,12 @@ object DumpDefaultConfigs { val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] - val fos: FileOutputStream = new FileOutputStream(outputPath) - withResource(fos) { _ => - val bos: BufferedOutputStream = new BufferedOutputStream(fos) - withResource(bos) { _ => + withResource(new FileOutputStream(outputPath)) { fos => + withResource(new BufferedOutputStream(fos)) { bos => format match { case Format.PLAIN => - val dos: DataOutputStream = new DataOutputStream(bos) - withResource(dos) { _ => - allConfs.foreach({ case (k, v) => + withResource(new DataOutputStream(bos)) { dos => + allConfs.foreach( { case (k, v) => val valStr = v match { case some: Some[_] => some.getOrElse("") case _ => @@ -69,9 +69,6 @@ object DumpDefaultConfigs { }) } case Format.JSON => - import org.json4s.jackson.Serialization.writePretty - import org.json4s.DefaultFormats - import java.nio.charset.StandardCharsets implicit val formats = DefaultFormats bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) case _ => diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 2ee846d0e70..d403ef8e5e2 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -181,7 +181,7 @@ exec-maven-plugin - clean previously populated config file + clean-previously-populated-config-file verify exec @@ -195,7 +195,7 @@ - populate default configs for testing + populate-default-configs-for-testing verify java @@ -211,7 +211,7 @@ - run pyspark tests + run-pyspark-tests verify exec From 37c0f92c765ada05a6470576708b48e90d1e18be Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:36:11 -0700 Subject: [PATCH 03/40] missing version update in 2.13 pom --- scala2.13/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 88df149ccdd..833282f5904 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -1439,7 +1439,7 @@ This will force full Scala code rebuild in downstream modules. 
org.codehaus.mojo exec-maven-plugin - 3.0.0 + 3.3.0 org.apache.maven.plugins From 85f05f35d956d501eb5ca864e86e09ed84aa3ada Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:39:29 -0700 Subject: [PATCH 04/40] fix match arms --- .../com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 0b9179471a2..bda989892e0 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -57,7 +57,8 @@ object DumpDefaultConfigs { withResource(new DataOutputStream(bos)) { dos => allConfs.foreach( { case (k, v) => val valStr = v match { - case some: Some[_] => some.getOrElse("") + case Some(optVal) => optVal.toString + case None => "" case _ => if (v == null) { "" From 6ba7300daa440b706712345e8d44a618cde4f261 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:55:39 -0700 Subject: [PATCH 05/40] take the json file path as an input --- integration_tests/conftest.py | 3 +++ integration_tests/run_pyspark_from_build.sh | 2 +- integration_tests/src/main/python/conftest.py | 6 ++++++ integration_tests/src/main/python/spark_session.py | 5 ++--- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py index f2e3435d0c5..3b7e1db15a8 100644 --- a/integration_tests/conftest.py +++ b/integration_tests/conftest.py @@ -62,3 +62,6 @@ def pytest_addoption(parser): parser.addoption( "--pyarrow_test", action='store_true', default=False, help="if enable pyarrow tests" ) + parser.addoption( + "--default_configs_path", action="store", default=None, help="path to a JSON file that stores default configs for integration test" + ) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index b3f223d7a46..8c7ecf6b3da 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -15,7 +15,6 @@ set -ex SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" -export SPARK_RAPIDS_DEFAULT_CONFIGS_PATH=${SCRIPTPATH}/target/spark-rapids-default-configs.json cd "$SCRIPTPATH" if [[ $( echo ${SKIP_TESTS} | tr [:upper:] [:lower:] ) == "true" ]]; @@ -233,6 +232,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml + --default_configs_path="${SCRIPTPATH}/target/spark-rapids-default-configs.json" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py index 49f69f31837..1503c024fe8 100644 --- a/integration_tests/src/main/python/conftest.py +++ b/integration_tests/src/main/python/conftest.py @@ -156,6 +156,10 @@ def is_parquet_testing_tests_forced(): def get_inject_oom_conf(): return _inject_oom +_default_configs_path = None + +def get_default_configs_path(): + return _default_configs_path # For datagen: we expect a seed to be provided by the environment, or default to 0. # Note that tests can override their seed when calling into datagen by setting seed= in their tests. 
@@ -304,6 +308,8 @@ def pytest_configure(config): raise Exception("not supported test type {}".format(test_type)) global _is_parquet_testing_tests_forced _is_parquet_testing_tests_forced = config.getoption("force_parquet_testing_tests") + global _default_configs_path + _default_configs_path = config.getoption("default_configs_path") # For OOM injection: we expect a seed to be provided by the environment, or default to 1. # This is done such that any worker started by the xdist plugin for pytest will diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index 8356b49f48c..f67c3cd52a2 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -16,7 +16,7 @@ import calendar, time from datetime import date, datetime from contextlib import contextmanager, ExitStack -from conftest import is_allowing_any_non_gpu, get_non_gpu_allowed, get_validate_execs_in_gpu_plan, is_databricks_runtime, is_at_least_precommit_run, get_inject_oom_conf, is_per_test_ansi_mode_enabled +from conftest import is_allowing_any_non_gpu, get_non_gpu_allowed, get_validate_execs_in_gpu_plan, is_databricks_runtime, is_at_least_precommit_run, get_inject_oom_conf, is_per_test_ansi_mode_enabled, get_default_configs_path from pyspark.sql import DataFrame from pyspark.sql.types import TimestampType, DateType, _acceptable_types from spark_init_internal import get_spark_i_know_what_i_am_doing, spark_version @@ -41,8 +41,7 @@ def _from_scala_map(scala_map): # These settings can be overridden by specific tests if necessary. # Many of these are redundant with default settings for the configs but are set here explicitly # to ensure any cluster settings do not interfere with tests that assume the defaults. 
-conf_file_path = os.environ['SPARK_RAPIDS_DEFAULT_CONFIGS_PATH'] -with open(conf_file_path) as conf_file: +with open(get_default_configs_path()) as conf_file: _default_conf = json.load(conf_file) def _set_all_confs(conf): From 8dbd354cb7f740fcd0cc1bcec2c31dabf4c13b8a Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 14:54:41 -0700 Subject: [PATCH 06/40] add the new config file in the assembly --- integration_tests/pom.xml | 66 +++++++++++---------- integration_tests/run_pyspark_from_build.sh | 4 +- integration_tests/src/assembly/bin.xml | 4 ++ 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index d9a1b504247..3d66a26bedc 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -78,6 +78,42 @@ + + org.codehaus.mojo + exec-maven-plugin + + + clean-previously-populated-config-file + package + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate-default-configs-for-testing + package + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + maven-assembly-plugin 3.6.0 @@ -180,36 +216,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - verify - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - populate-default-configs-for-testing - verify - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - run-pyspark-tests verify diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 8c7ecf6b3da..4e125a6476b 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -76,7 +76,7 @@ else # Make sure we have Parquet version >= 1.12 in the dependency LOWEST_PARQUET_JAR=$(echo -e "$MIN_PARQUET_JAR\n$PARQUET_HADOOP_TESTS" | sort -V | head -1) export INCLUDE_PARQUET_HADOOP_TEST_JAR=$([[ "$LOWEST_PARQUET_JAR" == "$MIN_PARQUET_JAR" ]] && echo true || echo false) - PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/target/rapids-4-spark_*.jar) + PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/rapids-4-spark_*.jar) # the integration-test-spark3xx.jar, should not include the integration-test-spark3xxtest.jar TEST_JARS=$(echo "$TARGET_DIR"/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar) fi @@ -232,7 +232,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml - --default_configs_path="${SCRIPTPATH}/target/spark-rapids-default-configs.json" + --default_configs_path="${TARGET_DIR}/spark-rapids-default-configs.json" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 6209d0b152a..47f5f9e9d52 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -50,6 +50,10 @@ ${project.build.outputDirectory}/rapids4spark-version-info.properties integration_tests + + ${spark.rapids.source.basedir}/integration_tests/target/spark-rapids-default-configs.json + integration_tests + From cb88561a5f14eb97facd9e34a7def587c7f9f21f Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 14:57:36 -0700 Subject: [PATCH 07/40] 
missing 2.13 change --- scala2.13/integration_tests/pom.xml | 66 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index d403ef8e5e2..1115b016e13 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -78,6 +78,42 @@ + + org.codehaus.mojo + exec-maven-plugin + + + clean-previously-populated-config-file + package + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate-default-configs-for-testing + package + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + maven-assembly-plugin 3.6.0 @@ -180,36 +216,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - verify - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - populate-default-configs-for-testing - verify - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - run-pyspark-tests verify From 3a26d1598c3080cd74dcc5f659693aef98d4abfe Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:04:59 -0700 Subject: [PATCH 08/40] use maven build directory var --- integration_tests/src/assembly/bin.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 47f5f9e9d52..b46d026ef80 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,7 @@ integration_tests - ${spark.rapids.source.basedir}/integration_tests/target/spark-rapids-default-configs.json + ${project.build.directory}/spark-rapids-default-configs.json integration_tests From 5d59de7c62865806dbfd1096da2ed1c14894d440 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:07:32 -0700 Subject: [PATCH 09/40] revert unintended change --- integration_tests/run_pyspark_from_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 4e125a6476b..341daa6f838 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -76,7 +76,7 @@ else # Make sure we have Parquet version >= 1.12 in the dependency LOWEST_PARQUET_JAR=$(echo -e "$MIN_PARQUET_JAR\n$PARQUET_HADOOP_TESTS" | sort -V | head -1) export INCLUDE_PARQUET_HADOOP_TEST_JAR=$([[ "$LOWEST_PARQUET_JAR" == "$MIN_PARQUET_JAR" ]] && echo true || echo false) - PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/rapids-4-spark_*.jar) + PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/target/rapids-4-spark_*.jar) # the integration-test-spark3xx.jar, should not include the integration-test-spark3xxtest.jar TEST_JARS=$(echo "$TARGET_DIR"/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar) fi From a7a04d05e6b272873acc798b348bb860fa0ada53 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:10:28 -0700 Subject: [PATCH 10/40] remove unnecessary clean --- integration_tests/pom.xml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 
3d66a26bedc..7d49b09af32 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -82,20 +82,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - package - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - populate-default-configs-for-testing package From a7b50625b493717d6967673c2510cc884fdb231f Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 29 Jul 2024 14:57:52 -0700 Subject: [PATCH 11/40] Add a new interface to retrieve all configs with their defaults; Add a new stage for integration test to populate default configs Signed-off-by: Jihoon Son --- integration_tests/pom.xml | 30 +++++++ integration_tests/run_pyspark_from_build.sh | 1 + .../src/main/python/spark_session.py | 26 ++---- .../rapids/tests/DumpDefaultConfigs.scala | 83 +++++++++++++++++++ pom.xml | 2 +- scala2.13/integration_tests/pom.xml | 30 +++++++ .../com/nvidia/spark/rapids/RapidsConf.scala | 20 +++++ 7 files changed, 173 insertions(+), 19 deletions(-) create mode 100644 integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 3ea20b75610..389261537b8 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -180,6 +180,36 @@ org.codehaus.mojo exec-maven-plugin + + clean previously populated config file + verify + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate default configs for testing + verify + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + run pyspark tests verify diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 22a23349791..b3f223d7a46 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -15,6 +15,7 @@ set -ex SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +export SPARK_RAPIDS_DEFAULT_CONFIGS_PATH=${SCRIPTPATH}/target/spark-rapids-default-configs.json cd "$SCRIPTPATH" if [[ $( echo ${SKIP_TESTS} | tr [:upper:] [:lower:] ) == "true" ]]; diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index 26388617fff..8356b49f48c 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -21,6 +21,7 @@ from pyspark.sql.types import TimestampType, DateType, _acceptable_types from spark_init_internal import get_spark_i_know_what_i_am_doing, spark_version from unittest.mock import patch +import json def _from_scala_map(scala_map): ret = {} @@ -40,23 +41,9 @@ def _from_scala_map(scala_map): # These settings can be overridden by specific tests if necessary. # Many of these are redundant with default settings for the configs but are set here explicitly # to ensure any cluster settings do not interfere with tests that assume the defaults. 
-_default_conf = { - 'spark.rapids.sql.castDecimalToFloat.enabled': 'false', - 'spark.rapids.sql.castFloatToDecimal.enabled': 'false', - 'spark.rapids.sql.castFloatToIntegralTypes.enabled': 'false', - 'spark.rapids.sql.castFloatToString.enabled': 'false', - 'spark.rapids.sql.castStringToFloat.enabled': 'false', - 'spark.rapids.sql.castStringToTimestamp.enabled': 'false', - 'spark.rapids.sql.fast.sample': 'false', - 'spark.rapids.sql.hasExtendedYearValues': 'true', - 'spark.rapids.sql.hashOptimizeSort.enabled': 'false', - 'spark.rapids.sql.improvedFloatOps.enabled': 'false', - 'spark.rapids.sql.incompatibleDateFormats.enabled': 'false', - 'spark.rapids.sql.incompatibleOps.enabled': 'false', - 'spark.rapids.sql.mode': 'executeongpu', - 'spark.rapids.sql.variableFloatAgg.enabled': 'false', - 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true', -} +conf_file_path = os.environ['SPARK_RAPIDS_DEFAULT_CONFIGS_PATH'] +with open(conf_file_path) as conf_file: + _default_conf = json.load(conf_file) def _set_all_confs(conf): newconf = _default_conf.copy() @@ -67,7 +54,10 @@ def _set_all_confs(conf): newconf.update(conf) for key, value in newconf.items(): if _spark.conf.get(key, None) != value: - _spark.conf.set(key, value) + if isinstance(value, str): + _spark.conf.set(key, value) + else: + _spark.conf.set(key, str(value)) def reset_spark_session_conf(): """Reset all of the configs for a given spark session.""" diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala new file mode 100644 index 00000000000..57f77247216 --- /dev/null +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tests + +import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} +import java.util.Locale + +import com.nvidia.spark.rapids.Arm.withResource +import com.nvidia.spark.rapids.ShimReflectionUtils + +/** + * Dump all spark-rapids configs with their defaults. + */ +object DumpDefaultConfigs { + + object Format extends Enumeration { + type Format = Value + val PLAIN, JSON = Value + } + + def main(args: Array[String]): Unit = { + if (args.length < 2) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName} {format} {output_path}") + System.exit(1) + } + + val format = Format.withName(args(0).toUpperCase(Locale.US)) + val outputPath = args(1) + + println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") + + // We use the reflection as RapidsConf should be accessed via the shim layer. 
+ val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") + val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") + val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] + val fos: FileOutputStream = new FileOutputStream(outputPath) + withResource(fos) { _ => + val bos: BufferedOutputStream = new BufferedOutputStream(fos) + withResource(bos) { _ => + format match { + case Format.PLAIN => + val dos: DataOutputStream = new DataOutputStream(bos) + withResource(dos) { _ => + allConfs.foreach({ case (k, v) => + val valStr = v match { + case some: Some[_] => some.getOrElse("") + case _ => + if (v == null) { + "" + } else { + v.toString + } + } + dos.writeUTF(s"'${k}': '${valStr}',") + }) + } + case Format.JSON => + import org.json4s.jackson.Serialization.writePretty + import org.json4s.DefaultFormats + import java.nio.charset.StandardCharsets + implicit val formats = DefaultFormats + bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) + case _ => + System.err.println(s"Unknown format: ${format}") + } + } + } + } +} diff --git a/pom.xml b/pom.xml index 358aa0c52b8..e7a6bb18489 100644 --- a/pom.xml +++ b/pom.xml @@ -1439,7 +1439,7 @@ This will force full Scala code rebuild in downstream modules. org.codehaus.mojo exec-maven-plugin - 3.0.0 + 3.3.0 org.apache.maven.plugins diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 4c3ea72f341..ff1da425fbe 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -180,6 +180,36 @@ org.codehaus.mojo exec-maven-plugin + + clean previously populated config file + verify + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate default configs for testing + verify + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + run pyspark tests verify diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 14551471e66..fc25e463f05 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -124,6 +124,7 @@ abstract class ConfEntry[T](val key: String, val converter: String => T, val doc def get(conf: Map[String, String]): T def get(conf: SQLConf): T + def getDefault(): T def help(asTable: Boolean = false): Unit override def toString: String = key @@ -147,6 +148,10 @@ class ConfEntryWithDefault[T](key: String, converter: String => T, doc: String, } } + override def getDefault(): T = { + defaultValue + } + override def help(asTable: Boolean = false): Unit = { if (!isInternal) { val startupOnlyStr = if (isStartupOnly) "Startup" else "Runtime" @@ -182,6 +187,10 @@ class OptionalConfEntry[T](key: String, val rawConverter: String => T, doc: Stri } } + override def getDefault(): Option[T] = { + None + } + override def help(asTable: Boolean = false): Unit = { if (!isInternal) { val startupOnlyStr = if (isStartupOnly) "Startup" else "Runtime" @@ -2374,6 +2383,17 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. println("-----|-----------------|-------------|---------------|------") } + /** + * Returns all spark-rapids configs with their default values. 
+ * This function is used to dump default configs, so that they + * could be used by the integration test. + */ + def getAllConfigsWithDefault: Map[String, Any] = { + val allConfs = registeredConfs.clone() + allConfs.append(RapidsPrivateUtil.getPrivateConfigs(): _*) + allConfs.map(e => e.key -> e.getDefault).toMap + } + def help(asTable: Boolean = false): Unit = { helpCommon(asTable) helpAdvanced(asTable) From 44837451f00dd1edd9dc2a2739961d41c212bd6e Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 11:00:46 -0700 Subject: [PATCH 12/40] address comments --- integration_tests/pom.xml | 6 +++--- .../spark/rapids/tests/DumpDefaultConfigs.scala | 17 +++++++---------- scala2.13/integration_tests/pom.xml | 6 +++--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 389261537b8..b4b5465e615 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -181,7 +181,7 @@ exec-maven-plugin - clean previously populated config file + clean-previously-populated-config-file verify exec @@ -195,7 +195,7 @@ - populate default configs for testing + populate-default-configs-for-testing verify java @@ -211,7 +211,7 @@ - run pyspark tests + run-pyspark-tests verify exec diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 57f77247216..0b9179471a2 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -17,10 +17,13 @@ package com.nvidia.spark.rapids.tests import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} +import java.nio.charset.StandardCharsets import java.util.Locale import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.ShimReflectionUtils +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization.writePretty /** * Dump all spark-rapids configs with their defaults. 
@@ -47,15 +50,12 @@ object DumpDefaultConfigs { val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] - val fos: FileOutputStream = new FileOutputStream(outputPath) - withResource(fos) { _ => - val bos: BufferedOutputStream = new BufferedOutputStream(fos) - withResource(bos) { _ => + withResource(new FileOutputStream(outputPath)) { fos => + withResource(new BufferedOutputStream(fos)) { bos => format match { case Format.PLAIN => - val dos: DataOutputStream = new DataOutputStream(bos) - withResource(dos) { _ => - allConfs.foreach({ case (k, v) => + withResource(new DataOutputStream(bos)) { dos => + allConfs.foreach( { case (k, v) => val valStr = v match { case some: Some[_] => some.getOrElse("") case _ => @@ -69,9 +69,6 @@ object DumpDefaultConfigs { }) } case Format.JSON => - import org.json4s.jackson.Serialization.writePretty - import org.json4s.DefaultFormats - import java.nio.charset.StandardCharsets implicit val formats = DefaultFormats bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) case _ => diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index ff1da425fbe..3b099ccb38a 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -181,7 +181,7 @@ exec-maven-plugin - clean previously populated config file + clean-previously-populated-config-file verify exec @@ -195,7 +195,7 @@ - populate default configs for testing + populate-default-configs-for-testing verify java @@ -211,7 +211,7 @@ - run pyspark tests + run-pyspark-tests verify exec From 52fce4cd6d4c4555f5cad3edd2431dae1a8e5328 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:36:11 -0700 Subject: [PATCH 13/40] missing version update in 2.13 pom --- scala2.13/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 1161334a4c0..e0574430ba6 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -1439,7 +1439,7 @@ This will force full Scala code rebuild in downstream modules. 
org.codehaus.mojo exec-maven-plugin - 3.0.0 + 3.3.0 org.apache.maven.plugins From 35735c363440d00aced154c03c4e9d4bc9396db3 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:39:29 -0700 Subject: [PATCH 14/40] fix match arms --- .../com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 0b9179471a2..bda989892e0 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -57,7 +57,8 @@ object DumpDefaultConfigs { withResource(new DataOutputStream(bos)) { dos => allConfs.foreach( { case (k, v) => val valStr = v match { - case some: Some[_] => some.getOrElse("") + case Some(optVal) => optVal.toString + case None => "" case _ => if (v == null) { "" From 9f9ef1e36b60c9329e9d64e4ad3aa55f89e7278f Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 13:55:39 -0700 Subject: [PATCH 15/40] take the json file path as an input --- integration_tests/conftest.py | 3 +++ integration_tests/run_pyspark_from_build.sh | 2 +- integration_tests/src/main/python/conftest.py | 6 ++++++ integration_tests/src/main/python/spark_session.py | 5 ++--- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py index f2e3435d0c5..3b7e1db15a8 100644 --- a/integration_tests/conftest.py +++ b/integration_tests/conftest.py @@ -62,3 +62,6 @@ def pytest_addoption(parser): parser.addoption( "--pyarrow_test", action='store_true', default=False, help="if enable pyarrow tests" ) + parser.addoption( + "--default_configs_path", action="store", default=None, help="path to a JSON file that stores default configs for integration test" + ) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index b3f223d7a46..8c7ecf6b3da 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -15,7 +15,6 @@ set -ex SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" -export SPARK_RAPIDS_DEFAULT_CONFIGS_PATH=${SCRIPTPATH}/target/spark-rapids-default-configs.json cd "$SCRIPTPATH" if [[ $( echo ${SKIP_TESTS} | tr [:upper:] [:lower:] ) == "true" ]]; @@ -233,6 +232,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml + --default_configs_path="${SCRIPTPATH}/target/spark-rapids-default-configs.json" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py index 49f69f31837..1503c024fe8 100644 --- a/integration_tests/src/main/python/conftest.py +++ b/integration_tests/src/main/python/conftest.py @@ -156,6 +156,10 @@ def is_parquet_testing_tests_forced(): def get_inject_oom_conf(): return _inject_oom +_default_configs_path = None + +def get_default_configs_path(): + return _default_configs_path # For datagen: we expect a seed to be provided by the environment, or default to 0. # Note that tests can override their seed when calling into datagen by setting seed= in their tests. 
@@ -304,6 +308,8 @@ def pytest_configure(config): raise Exception("not supported test type {}".format(test_type)) global _is_parquet_testing_tests_forced _is_parquet_testing_tests_forced = config.getoption("force_parquet_testing_tests") + global _default_configs_path + _default_configs_path = config.getoption("default_configs_path") # For OOM injection: we expect a seed to be provided by the environment, or default to 1. # This is done such that any worker started by the xdist plugin for pytest will diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index 8356b49f48c..f67c3cd52a2 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -16,7 +16,7 @@ import calendar, time from datetime import date, datetime from contextlib import contextmanager, ExitStack -from conftest import is_allowing_any_non_gpu, get_non_gpu_allowed, get_validate_execs_in_gpu_plan, is_databricks_runtime, is_at_least_precommit_run, get_inject_oom_conf, is_per_test_ansi_mode_enabled +from conftest import is_allowing_any_non_gpu, get_non_gpu_allowed, get_validate_execs_in_gpu_plan, is_databricks_runtime, is_at_least_precommit_run, get_inject_oom_conf, is_per_test_ansi_mode_enabled, get_default_configs_path from pyspark.sql import DataFrame from pyspark.sql.types import TimestampType, DateType, _acceptable_types from spark_init_internal import get_spark_i_know_what_i_am_doing, spark_version @@ -41,8 +41,7 @@ def _from_scala_map(scala_map): # These settings can be overridden by specific tests if necessary. # Many of these are redundant with default settings for the configs but are set here explicitly # to ensure any cluster settings do not interfere with tests that assume the defaults. 
-conf_file_path = os.environ['SPARK_RAPIDS_DEFAULT_CONFIGS_PATH'] -with open(conf_file_path) as conf_file: +with open(get_default_configs_path()) as conf_file: _default_conf = json.load(conf_file) def _set_all_confs(conf): From d73581eccc0501d99a0cdf30204d0d72cc460f5c Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 14:54:41 -0700 Subject: [PATCH 16/40] add the new config file in the assembly --- integration_tests/pom.xml | 66 +++++++++++---------- integration_tests/run_pyspark_from_build.sh | 4 +- integration_tests/src/assembly/bin.xml | 4 ++ 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index b4b5465e615..073f49cb4e1 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -78,6 +78,42 @@ + + org.codehaus.mojo + exec-maven-plugin + + + clean-previously-populated-config-file + package + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate-default-configs-for-testing + package + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + maven-assembly-plugin 3.6.0 @@ -180,36 +216,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - verify - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - populate-default-configs-for-testing - verify - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - run-pyspark-tests verify diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 8c7ecf6b3da..4e125a6476b 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -76,7 +76,7 @@ else # Make sure we have Parquet version >= 1.12 in the dependency LOWEST_PARQUET_JAR=$(echo -e "$MIN_PARQUET_JAR\n$PARQUET_HADOOP_TESTS" | sort -V | head -1) export INCLUDE_PARQUET_HADOOP_TEST_JAR=$([[ "$LOWEST_PARQUET_JAR" == "$MIN_PARQUET_JAR" ]] && echo true || echo false) - PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/target/rapids-4-spark_*.jar) + PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/rapids-4-spark_*.jar) # the integration-test-spark3xx.jar, should not include the integration-test-spark3xxtest.jar TEST_JARS=$(echo "$TARGET_DIR"/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar) fi @@ -232,7 +232,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml - --default_configs_path="${SCRIPTPATH}/target/spark-rapids-default-configs.json" + --default_configs_path="${TARGET_DIR}/spark-rapids-default-configs.json" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 6209d0b152a..47f5f9e9d52 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -50,6 +50,10 @@ ${project.build.outputDirectory}/rapids4spark-version-info.properties integration_tests + + ${spark.rapids.source.basedir}/integration_tests/target/spark-rapids-default-configs.json + integration_tests + From 99ea474231efddd408aad285fb5ed44e7500810d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 1 Aug 2024 14:57:36 -0700 Subject: [PATCH 17/40] 
missing 2.13 change --- scala2.13/integration_tests/pom.xml | 66 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 3b099ccb38a..5e2ec78cdbe 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -78,6 +78,42 @@ + + org.codehaus.mojo + exec-maven-plugin + + + clean-previously-populated-config-file + package + + exec + + + rm + + -f + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + populate-default-configs-for-testing + package + + java + + + true + com.nvidia.spark.rapids.tests.DumpDefaultConfigs + provided + + json + ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json + + + + + maven-assembly-plugin 3.6.0 @@ -180,36 +216,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - verify - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - populate-default-configs-for-testing - verify - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - run-pyspark-tests verify From 519bb7d4f51c62664b9093fe20457ee91e77d63e Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:04:59 -0700 Subject: [PATCH 18/40] use maven build directory var --- integration_tests/src/assembly/bin.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 47f5f9e9d52..b46d026ef80 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,7 @@ integration_tests - ${spark.rapids.source.basedir}/integration_tests/target/spark-rapids-default-configs.json + ${project.build.directory}/spark-rapids-default-configs.json integration_tests From c3ec423d511c0228a5ea4309410fa1d399ba5cd0 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:07:32 -0700 Subject: [PATCH 19/40] revert unintended change --- integration_tests/run_pyspark_from_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 4e125a6476b..341daa6f838 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -76,7 +76,7 @@ else # Make sure we have Parquet version >= 1.12 in the dependency LOWEST_PARQUET_JAR=$(echo -e "$MIN_PARQUET_JAR\n$PARQUET_HADOOP_TESTS" | sort -V | head -1) export INCLUDE_PARQUET_HADOOP_TEST_JAR=$([[ "$LOWEST_PARQUET_JAR" == "$MIN_PARQUET_JAR" ]] && echo true || echo false) - PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/rapids-4-spark_*.jar) + PLUGIN_JARS=$(echo "$TARGET_DIR"/../../dist/target/rapids-4-spark_*.jar) # the integration-test-spark3xx.jar, should not include the integration-test-spark3xxtest.jar TEST_JARS=$(echo "$TARGET_DIR"/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar) fi From 076351c158afd75ba366e6e7eeb42533f1573fb1 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 10:10:28 -0700 Subject: [PATCH 20/40] remove unnecessary clean --- integration_tests/pom.xml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 
073f49cb4e1..ab91639bb87 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -82,20 +82,6 @@ org.codehaus.mojo exec-maven-plugin - - clean-previously-populated-config-file - package - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - populate-default-configs-for-testing package From b02faa2788d60916e4734b6e8fc92dff1273d768 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 2 Aug 2024 16:54:50 -0700 Subject: [PATCH 21/40] Add things in RapidsConf --- dist/pom.xml | 18 ++++ integration_tests/pom.xml | 22 ---- integration_tests/run_pyspark_from_build.sh | 3 +- integration_tests/src/assembly/bin.xml | 2 +- .../rapids/tests/DumpDefaultConfigs.scala | 81 -------------- .../com/nvidia/spark/rapids/RapidsConf.scala | 101 +++++++++++++++--- 6 files changed, 106 insertions(+), 121 deletions(-) delete mode 100644 integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala diff --git a/dist/pom.xml b/dist/pom.xml index abf6232c1a2..730ae5512a3 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -387,6 +387,23 @@ self.log("... OK") + + populate_default_configs_for_testing + package + + run + + + + + + + + + + + + update_config_docs verify @@ -400,6 +417,7 @@ self.log("... OK") + diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index ab91639bb87..e868bf0b0a2 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -78,28 +78,6 @@ - - org.codehaus.mojo - exec-maven-plugin - - - populate-default-configs-for-testing - package - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - maven-assembly-plugin 3.6.0 diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 341daa6f838..02a0efadbd5 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -223,6 +223,7 @@ else REPORT_CHARS=${REPORT_CHARS:="fE"} # default as (f)ailed, (E)rror STD_INPUT_PATH="$INPUT_PATH"/src/test/resources + DEFAULT_CONFIGS_PATH="${SCRIPTPATH}"/../dist/target/spark-rapids-default-configs.json TEST_COMMON_OPTS=(-v -r"$REPORT_CHARS" "$TEST_TAGS" @@ -232,7 +233,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml - --default_configs_path="${TARGET_DIR}/spark-rapids-default-configs.json" + --default_configs_path="${DEFAULT_CONFIGS_PATH}" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index b46d026ef80..a0b0bc1f43b 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,7 @@ integration_tests - ${project.build.directory}/spark-rapids-default-configs.json + ${spark.rapids.source.basedir}/dist/target/spark-rapids-default-configs.json integration_tests diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala deleted file mode 100644 index bda989892e0..00000000000 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.tests - -import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} -import java.nio.charset.StandardCharsets -import java.util.Locale - -import com.nvidia.spark.rapids.Arm.withResource -import com.nvidia.spark.rapids.ShimReflectionUtils -import org.json4s.DefaultFormats -import org.json4s.jackson.Serialization.writePretty - -/** - * Dump all spark-rapids configs with their defaults. - */ -object DumpDefaultConfigs { - - object Format extends Enumeration { - type Format = Value - val PLAIN, JSON = Value - } - - def main(args: Array[String]): Unit = { - if (args.length < 2) { - System.err.println(s"Usage: ${this.getClass.getCanonicalName} {format} {output_path}") - System.exit(1) - } - - val format = Format.withName(args(0).toUpperCase(Locale.US)) - val outputPath = args(1) - - println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") - - // We use the reflection as RapidsConf should be accessed via the shim layer. - val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") - val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") - val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] - withResource(new FileOutputStream(outputPath)) { fos => - withResource(new BufferedOutputStream(fos)) { bos => - format match { - case Format.PLAIN => - withResource(new DataOutputStream(bos)) { dos => - allConfs.foreach( { case (k, v) => - val valStr = v match { - case Some(optVal) => optVal.toString - case None => "" - case _ => - if (v == null) { - "" - } else { - v.toString - } - } - dos.writeUTF(s"'${k}': '${valStr}',") - }) - } - case Format.JSON => - implicit val formats = DefaultFormats - bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) - case _ => - System.err.println(s"Unknown format: ${format}") - } - } - } - } -} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index fc25e463f05..5a3842cece1 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -15,15 +15,19 @@ */ package com.nvidia.spark.rapids -import java.io.{File, FileOutputStream} +import java.io.{BufferedOutputStream, DataOutputStream, File, FileOutputStream} +import java.nio.charset.StandardCharsets import java.util - -import scala.collection.JavaConverters._ -import scala.collection.mutable.{HashMap, ListBuffer} +import java.util.Locale import ai.rapids.cudf.Cuda +import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.jni.RmmSpark.OomInjectionType import com.nvidia.spark.rapids.lore.{LoreId, OutputLoreId} +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization.writePretty +import scala.collection.JavaConverters._ +import scala.collection.mutable.{HashMap, ListBuffer} import 
org.apache.spark.SparkConf import org.apache.spark.internal.Logging @@ -2394,6 +2398,42 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. allConfs.map(e => e.key -> e.getDefault).toMap } + object Format extends Enumeration { + type Format = Value + val PLAIN, JSON = Value + } + + def dumpAllConfigsWithDefault(format: Format.Value, outputPath: String): Unit = { + val allConfs: Map[String, Any] = RapidsConf.getAllConfigsWithDefault + withResource(new FileOutputStream(outputPath)) { fos => + withResource(new BufferedOutputStream(fos)) { bos => + format match { + case Format.PLAIN => + withResource(new DataOutputStream(bos)) { dos => + allConfs.foreach( { case (k, v) => + val valStr = v match { + case Some(optVal) => optVal.toString + case None => "" + case _ => + if (v == null) { + "" + } else { + v.toString + } + } + dos.writeUTF(s"'${k}': '${valStr}',") + }) + } + case Format.JSON => + implicit val formats = DefaultFormats + bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) + case _ => + System.err.println(s"Unknown format: ${format}") + } + } + } + } + def help(asTable: Boolean = false): Unit = { helpCommon(asTable) helpAdvanced(asTable) @@ -2528,19 +2568,48 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. GpuOverrides.parts.values.toSeq.sortBy(_.tag.toString).foreach(_.confHelp(asTable)) } def main(args: Array[String]): Unit = { - // Include the configs in PythonConfEntries - com.nvidia.spark.rapids.python.PythonConfEntries.init() - val configs = new FileOutputStream(new File(args(0))) - Console.withOut(configs) { - Console.withErr(configs) { - RapidsConf.helpCommon(true) - } + if (args.length < 1) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName} {command} ...") + System.exit(1) } - val advanced = new FileOutputStream(new File(args(1))) - Console.withOut(advanced) { - Console.withErr(advanced) { - RapidsConf.helpAdvanced(true) - } + + val command = args(0) + + command match { + case "help" => + if (args.length < 3) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName} ${command}" + + s" {common_configs_path} {advanced_configs_path}") + System.exit(1) + } + + // Include the configs in PythonConfEntries + com.nvidia.spark.rapids.python.PythonConfEntries.init() + val configs = new FileOutputStream(new File(args(1))) + Console.withOut(configs) { + Console.withErr(configs) { + RapidsConf.helpCommon(true) + } + } + val advanced = new FileOutputStream(new File(args(2))) + Console.withOut(advanced) { + Console.withErr(advanced) { + RapidsConf.helpAdvanced(true) + } + } + case "dump-default-configs" => + if (args.length < 3) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName} ${command}" + + s" {format} {output_path}") + System.exit(1) + } + + val format: RapidsConf.Format.Value = + RapidsConf.Format.withName(args(1).toUpperCase(Locale.US)) + val outputPath = args(2) + + println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") + RapidsConf.dumpAllConfigsWithDefault(format, outputPath) } } } From 4a3c862a5c6f3aac24be14def44940a2c68163db Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 5 Aug 2024 14:07:24 -0700 Subject: [PATCH 22/40] missing change for 2.13 --- scala2.13/dist/pom.xml | 18 +++++++++++++++ scala2.13/integration_tests/pom.xml | 36 ----------------------------- 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index b9a958493d8..9a7880b763d 100644 --- 
a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -387,6 +387,23 @@ self.log("... OK") + + populate_default_configs_for_testing + package + + run + + + + + + + + + + + + update_config_docs verify @@ -400,6 +417,7 @@ self.log("... OK") + diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 5e2ec78cdbe..25d17a3f890 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -78,42 +78,6 @@ - - org.codehaus.mojo - exec-maven-plugin - - - clean-previously-populated-config-file - package - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - populate-default-configs-for-testing - package - - java - - - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - - - maven-assembly-plugin 3.6.0 From 8a7f31692749ecdaa010d5d3a8fb5de8d190f378 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 5 Aug 2024 15:26:27 -0700 Subject: [PATCH 23/40] fix directory path for scala 2.13 --- integration_tests/src/assembly/bin.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index a0b0bc1f43b..19f1f1aaba0 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,8 @@ integration_tests - ${spark.rapids.source.basedir}/dist/target/spark-rapids-default-configs.json + + ${project.basedir}/../dist/target/spark-rapids-default-configs.json integration_tests From ddb145a21f7e93d151c7cfff91d290ae9ee1aa00 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 5 Aug 2024 16:22:48 -0700 Subject: [PATCH 24/40] exclude jackson from spark-hive --- integration_tests/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index e868bf0b0a2..3f6fe864903 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -68,6 +68,16 @@ org.apache.spark spark-hive_${scala.binary.version} + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + From 0542fdfb501b013f02edda0795ffb14a20b2c038 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 5 Aug 2024 16:28:22 -0700 Subject: [PATCH 25/40] missing change for 2.13 --- scala2.13/integration_tests/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 25d17a3f890..6bbd314ce0e 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -68,6 +68,16 @@ org.apache.spark spark-hive_${scala.binary.version} + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + From dab6f76bf45502c627fe00d6ddf3776c759e6e41 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 6 Aug 2024 11:16:36 -0700 Subject: [PATCH 26/40] exclude old jackson stuff from iceberg --- pom.xml | 14 ++++++++++++++ scala2.13/pom.xml | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/pom.xml b/pom.xml index e7a6bb18489..54046364651 100644 --- a/pom.xml +++ b/pom.xml @@ -939,6 +939,20 @@ iceberg-core ${iceberg.version} provided + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + org.apache.spark diff --git 
a/scala2.13/pom.xml b/scala2.13/pom.xml index e0574430ba6..cc33e64bf66 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -939,6 +939,20 @@ iceberg-core ${iceberg.version} provided + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + org.apache.spark From 49a63d60ba73be8721112eaa41d7602554cf8db7 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 6 Aug 2024 14:35:33 -0700 Subject: [PATCH 27/40] copyrights --- integration_tests/conftest.py | 2 +- integration_tests/src/assembly/bin.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/conftest.py b/integration_tests/conftest.py index 3b7e1db15a8..1587b6591bd 100644 --- a/integration_tests/conftest.py +++ b/integration_tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 19f1f1aaba0..05593b1ada6 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -1,6 +1,6 @@ + + + + + + From c02e1c4460cd0697fd4be6342d23e8d6074f5e09 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 6 Aug 2024 18:26:58 -0700 Subject: [PATCH 29/40] fix config file path --- integration_tests/run_pyspark_from_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 02a0efadbd5..a0e87150a9e 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -223,7 +223,7 @@ else REPORT_CHARS=${REPORT_CHARS:="fE"} # default as (f)ailed, (E)rror STD_INPUT_PATH="$INPUT_PATH"/src/test/resources - DEFAULT_CONFIGS_PATH="${SCRIPTPATH}"/../dist/target/spark-rapids-default-configs.json + DEFAULT_CONFIGS_PATH="${TARGET_DIR}"/../../dist/target/spark-rapids-default-configs.json TEST_COMMON_OPTS=(-v -r"$REPORT_CHARS" "$TEST_TAGS" From b04484a98c4040c91a84b7869dd55e4f824ab579 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Wed, 7 Aug 2024 02:41:56 -0700 Subject: [PATCH 30/40] move most dump changes to rapids conf - fork generation step with maven.compile.classpath - change to a phase before package Signed-off-by: Gera Shegalov --- integration_tests/pom.xml | 9 ++-- integration_tests/src/assembly/bin.xml | 2 +- .../rapids/tests/DumpDefaultConfigs.scala | 49 +------------------ .../com/nvidia/spark/rapids/RapidsConf.scala | 44 +++++++++++++++++ 4 files changed, 53 insertions(+), 51 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 3e0811ffd8c..f69ee397ec9 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -94,16 +94,19 @@ populate-default-configs-for-testing - package + generate-test-resources run - + - + diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 29f9c15fb88..72a118a48cd 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,7 @@ integration_tests - ${project.build.directory}/spark-rapids-default-configs.json + ${project.build.outputDirectory}/spark-rapids-default-configs.json integration_tests diff --git 
a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index bda989892e0..20dc1e90244 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -16,66 +16,21 @@ package com.nvidia.spark.rapids.tests -import java.io.{BufferedOutputStream, DataOutputStream, FileOutputStream} -import java.nio.charset.StandardCharsets -import java.util.Locale +import org.apache.commons.lang3.reflect.MethodUtils -import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.ShimReflectionUtils -import org.json4s.DefaultFormats -import org.json4s.jackson.Serialization.writePretty /** * Dump all spark-rapids configs with their defaults. */ object DumpDefaultConfigs { - - object Format extends Enumeration { - type Format = Value - val PLAIN, JSON = Value - } - def main(args: Array[String]): Unit = { if (args.length < 2) { System.err.println(s"Usage: ${this.getClass.getCanonicalName} {format} {output_path}") System.exit(1) } - - val format = Format.withName(args(0).toUpperCase(Locale.US)) - val outputPath = args(1) - - println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") - // We use the reflection as RapidsConf should be accessed via the shim layer. val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") - val m = clazz.getDeclaredMethod("getAllConfigsWithDefault") - val allConfs: Map[String, Any] = m.invoke(null).asInstanceOf[Map[String, Any]] - withResource(new FileOutputStream(outputPath)) { fos => - withResource(new BufferedOutputStream(fos)) { bos => - format match { - case Format.PLAIN => - withResource(new DataOutputStream(bos)) { dos => - allConfs.foreach( { case (k, v) => - val valStr = v match { - case Some(optVal) => optVal.toString - case None => "" - case _ => - if (v == null) { - "" - } else { - v.toString - } - } - dos.writeUTF(s"'${k}': '${valStr}',") - }) - } - case Format.JSON => - implicit val formats = DefaultFormats - bos.write(writePretty(allConfs).getBytes(StandardCharsets.UTF_8)) - case _ => - System.err.println(s"Unknown format: ${format}") - } - } - } + MethodUtils.invokeStaticMethod(clazz, "dumpConfigsWithDefault", args) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index d7b5782b335..17029d742fd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2576,6 +2576,50 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
RapidsConf.dumpAllConfigsWithDefault(format, outputPath) } } + + object Format extends Enumeration { + type Format = Value + val PLAIN, JSON = Value + } + + def dumpConfigsWithDefault(args: Array[String]): Unit = { + import com.nvidia.spark.rapids.Arm._ + + val format = Format.withName(args(0).toUpperCase(java.util.Locale.US)) + val outputPath = args(1) + + println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") + + val allConfs = getAllConfigsWithDefault + withResource(new java.io.FileOutputStream(outputPath)) { fos => + withResource(new java.io.BufferedOutputStream(fos)) { bos => + format match { + case Format.PLAIN => + withResource(new java.io.DataOutputStream(bos)) { dos => + allConfs.foreach( { case (k, v) => + val valStr = v match { + case Some(optVal) => optVal.toString + case None => "" + case _ => + if (v == null) { + "" + } else { + v.toString + } + } + dos.writeUTF(s"'${k}': '${valStr}',") + }) + } + case Format.JSON => + implicit val formats = org.json4s.DefaultFormats + bos.write(org.json4s.jackson.Serialization.writePretty(allConfs) + .getBytes(java.nio.charset.StandardCharsets.UTF_8)) + case _ => + System.err.println(s"Unknown format: ${format}") + } + } + } + } } class RapidsConf(conf: Map[String, String]) extends Logging { From 3e59847ead0489a7a0176edbd405f4f9815a1269 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 7 Aug 2024 11:51:58 -0700 Subject: [PATCH 31/40] clean up after merge --- dist/pom.xml | 18 ----- integration_tests/pom.xml | 9 +-- integration_tests/run_pyspark_from_build.sh | 4 +- integration_tests/src/assembly/bin.xml | 2 +- .../rapids/tests/DumpDefaultConfigs.scala | 3 +- .../com/nvidia/spark/rapids/RapidsConf.scala | 70 ++++++------------- 6 files changed, 28 insertions(+), 78 deletions(-) diff --git a/dist/pom.xml b/dist/pom.xml index 730ae5512a3..abf6232c1a2 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -387,23 +387,6 @@ self.log("... OK") - - populate_default_configs_for_testing - package - - run - - - - - - - - - - - - update_config_docs verify @@ -417,7 +400,6 @@ self.log("... 
OK") - diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index f69ee397ec9..8829cd2d6ec 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -106,16 +106,9 @@ classpathref="maven.compile.classpath" fork="true"> - + - - - - - - - diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 27e71de5922..cd6cc5cef00 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -223,7 +223,7 @@ else REPORT_CHARS=${REPORT_CHARS:="fE"} # default as (f)ailed, (E)rror STD_INPUT_PATH="$INPUT_PATH"/src/test/resources - DEFAULT_CONFIGS_PATH="${TARGET_DIR}"/../../dist/target/spark-rapids-default-configs.json + DEFAULT_CONFIGS_PATH=${DEFAULT_CONFIGS_PATH:-${TARGET_DIR}/spark-rapids-default-configs.json} TEST_COMMON_OPTS=(-v -r"$REPORT_CHARS" "$TEST_TAGS" @@ -233,7 +233,7 @@ else "$TEST_ARGS" $RUN_TEST_PARAMS --junitxml=TEST-pytest-`date +%s%N`.xml - --default_configs_path="${TARGET_DIR}/spark-rapids-default-configs.json" + --default_configs_path="${DEFAULT_CONFIGS_PATH}" "$@") NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0} diff --git a/integration_tests/src/assembly/bin.xml b/integration_tests/src/assembly/bin.xml index 72a118a48cd..29f9c15fb88 100644 --- a/integration_tests/src/assembly/bin.xml +++ b/integration_tests/src/assembly/bin.xml @@ -51,7 +51,7 @@ integration_tests - ${project.build.outputDirectory}/spark-rapids-default-configs.json + ${project.build.directory}/spark-rapids-default-configs.json integration_tests diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 20dc1e90244..79451ced31b 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -16,9 +16,8 @@ package com.nvidia.spark.rapids.tests -import org.apache.commons.lang3.reflect.MethodUtils - import com.nvidia.spark.rapids.ShimReflectionUtils +import org.apache.commons.lang3.reflect.MethodUtils /** * Dump all spark-rapids configs with their defaults. diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 17029d742fd..718ad4f1c69 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -21,7 +21,6 @@ import java.util import java.util.Locale import ai.rapids.cudf.Cuda -import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.jni.RmmSpark.OomInjectionType import com.nvidia.spark.rapids.lore.{LoreId, OutputLoreId} import org.json4s.DefaultFormats @@ -2532,48 +2531,25 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
GpuOverrides.parts.values.toSeq.sortBy(_.tag.toString).foreach(_.confHelp(asTable)) } def main(args: Array[String]): Unit = { - if (args.length < 1) { - System.err.println(s"Usage: ${this.getClass.getCanonicalName} {command} ...") + if (args.length < 2) { + System.err.println(s"Usage: ${this.getClass.getCanonicalName}" + + s" {common_configs_path} {advanced_configs_path}") System.exit(1) } - val command = args(0) - - command match { - case "help" => - if (args.length < 3) { - System.err.println(s"Usage: ${this.getClass.getCanonicalName} ${command}" + - s" {common_configs_path} {advanced_configs_path}") - System.exit(1) - } - - // Include the configs in PythonConfEntries - com.nvidia.spark.rapids.python.PythonConfEntries.init() - val configs = new FileOutputStream(new File(args(1))) - Console.withOut(configs) { - Console.withErr(configs) { - RapidsConf.helpCommon(true) - } - } - val advanced = new FileOutputStream(new File(args(2))) - Console.withOut(advanced) { - Console.withErr(advanced) { - RapidsConf.helpAdvanced(true) - } - } - case "dump-default-configs" => - if (args.length < 3) { - System.err.println(s"Usage: ${this.getClass.getCanonicalName} ${command}" + - s" {format} {output_path}") - System.exit(1) - } - - val format: RapidsConf.Format.Value = - RapidsConf.Format.withName(args(1).toUpperCase(Locale.US)) - val outputPath = args(2) - - println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") - RapidsConf.dumpAllConfigsWithDefault(format, outputPath) + // Include the configs in PythonConfEntries + com.nvidia.spark.rapids.python.PythonConfEntries.init() + val configs = new FileOutputStream(new File(args(0))) + Console.withOut(configs) { + Console.withErr(configs) { + RapidsConf.helpCommon(true) + } + } + val advanced = new FileOutputStream(new File(args(1))) + Console.withOut(advanced) { + Console.withErr(advanced) { + RapidsConf.helpAdvanced(true) + } } } @@ -2585,17 +2561,17 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. def dumpConfigsWithDefault(args: Array[String]): Unit = { import com.nvidia.spark.rapids.Arm._ - val format = Format.withName(args(0).toUpperCase(java.util.Locale.US)) + val format = Format.withName(args(0).toUpperCase(Locale.US)) val outputPath = args(1) println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") val allConfs = getAllConfigsWithDefault - withResource(new java.io.FileOutputStream(outputPath)) { fos => - withResource(new java.io.BufferedOutputStream(fos)) { bos => + withResource(new FileOutputStream(outputPath)) { fos => + withResource(new BufferedOutputStream(fos)) { bos => format match { case Format.PLAIN => - withResource(new java.io.DataOutputStream(bos)) { dos => + withResource(new DataOutputStream(bos)) { dos => allConfs.foreach( { case (k, v) => val valStr = v match { case Some(optVal) => optVal.toString @@ -2611,9 +2587,9 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
}) } case Format.JSON => - implicit val formats = org.json4s.DefaultFormats - bos.write(org.json4s.jackson.Serialization.writePretty(allConfs) - .getBytes(java.nio.charset.StandardCharsets.UTF_8)) + implicit val formats = DefaultFormats + bos.write(writePretty(allConfs) + .getBytes(StandardCharsets.UTF_8)) case _ => System.err.println(s"Unknown format: ${format}") } From fa380fb475e29cc85cc362727114fb4ea468c28d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 7 Aug 2024 11:52:26 -0700 Subject: [PATCH 32/40] scala 2.13 --- scala2.13/dist/pom.xml | 18 ------------- scala2.13/integration_tests/pom.xml | 39 +++++++++++------------------ 2 files changed, 14 insertions(+), 43 deletions(-) diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index 9a7880b763d..b9a958493d8 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -387,23 +387,6 @@ self.log("... OK") - - populate_default_configs_for_testing - package - - run - - - - - - - - - - - - update_config_docs verify @@ -417,7 +400,6 @@ self.log("... OK") - diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 7e30a7927e4..6380d0dabb3 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -89,37 +89,26 @@ - org.codehaus.mojo - exec-maven-plugin + org.apache.maven.plugins + maven-antrun-plugin - - clean-previously-populated-config-file - package - - exec - - - rm - - -f - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - - - populate-default-configs-for-testing - package + generate-test-resources - java + run - true - com.nvidia.spark.rapids.tests.DumpDefaultConfigs - provided - - json - ${spark.rapids.source.basedir}/${rapids.module}/target/spark-rapids-default-configs.json - + + + + + + + From 64b58d9c6d78007bb92bbb861a0bfee0c4c1b82f Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Wed, 7 Aug 2024 13:06:03 -0700 Subject: [PATCH 33/40] more strict arg check --- .../com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala | 2 +- .../src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 79451ced31b..14ea0c02cb0 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -24,7 +24,7 @@ import org.apache.commons.lang3.reflect.MethodUtils */ object DumpDefaultConfigs { def main(args: Array[String]): Unit = { - if (args.length < 2) { + if (args.length != 2) { System.err.println(s"Usage: ${this.getClass.getCanonicalName} {format} {output_path}") System.exit(1) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 718ad4f1c69..aa960d75766 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2531,7 +2531,7 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
GpuOverrides.parts.values.toSeq.sortBy(_.tag.toString).foreach(_.confHelp(asTable)) } def main(args: Array[String]): Unit = { - if (args.length < 2) { + if (args.length != 2) { System.err.println(s"Usage: ${this.getClass.getCanonicalName}" + s" {common_configs_path} {advanced_configs_path}") System.exit(1) From f33683941de9a5f584eb91c1c25f8b2dcbe1842d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 8 Aug 2024 10:53:00 -0700 Subject: [PATCH 34/40] unpack ambiguous string arguments --- .../com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala | 2 +- .../main/scala/com/nvidia/spark/rapids/RapidsConf.scala | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala index 14ea0c02cb0..d3328cd42c3 100644 --- a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala +++ b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/DumpDefaultConfigs.scala @@ -30,6 +30,6 @@ object DumpDefaultConfigs { } // We use the reflection as RapidsConf should be accessed via the shim layer. val clazz = ShimReflectionUtils.loadClass("com.nvidia.spark.rapids.RapidsConf") - MethodUtils.invokeStaticMethod(clazz, "dumpConfigsWithDefault", args) + MethodUtils.invokeStaticMethod(clazz, "dumpConfigsWithDefault", args(0), args(1)) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index aa960d75766..38c966f62d0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2558,11 +2558,10 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. val PLAIN, JSON = Value } - def dumpConfigsWithDefault(args: Array[String]): Unit = { + def dumpConfigsWithDefault(formatName: String, outputPath: String): Unit = { import com.nvidia.spark.rapids.Arm._ - val format = Format.withName(args(0).toUpperCase(Locale.US)) - val outputPath = args(1) + val format = Format.withName(formatName.toUpperCase(Locale.US)) println(s"Dumping all spark-rapids configs and their defaults at ${outputPath}") @@ -2587,7 +2586,7 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
}) } case Format.JSON => - implicit val formats = DefaultFormats + implicit val formats: DefaultFormats.type = DefaultFormats bos.write(writePretty(allConfs) .getBytes(StandardCharsets.UTF_8)) case _ => From c4ba1a241e6382c39ada1c2067e2d26be38ba893 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 9 Aug 2024 11:04:12 -0700 Subject: [PATCH 35/40] allow legacy negative scale for decimals for some tests --- integration_tests/src/main/python/hash_aggregate_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index ea429d4533c..c74b8611475 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -372,6 +372,7 @@ def test_computation_in_grpby_columns(): @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_sum(data_gen, conf): + conf = copy_and_update(conf, {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.sum('b')), conf = conf) @@ -384,6 +385,7 @@ def test_hash_grpby_sum(data_gen, conf): @pytest.mark.parametrize('data_gen', [_grpkey_short_sum_full_decimals, _grpkey_short_sum_full_neg_scale_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_sum_full_decimal(data_gen, conf): + conf = copy_and_update(conf, {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.sum('b')), conf = conf) From 45beca389221c8c5535ba6800ba1e4355dcaa6f1 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 9 Aug 2024 13:38:59 -0700 Subject: [PATCH 36/40] should fork for RapidsConf --- dist/pom.xml | 2 +- scala2.13/dist/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dist/pom.xml b/dist/pom.xml index abf6232c1a2..28fea40ebdf 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -399,7 +399,7 @@ self.log("... OK") - + diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index b9a958493d8..ebfa0fcf829 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -399,7 +399,7 @@ self.log("... OK") - + From d45dd1e4fe18931ed2591fda9bf22c8df91d6f97 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 9 Aug 2024 14:09:04 -0700 Subject: [PATCH 37/40] remove System.exit() from RapidsConf.main() --- dist/pom.xml | 2 +- .../src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/dist/pom.xml b/dist/pom.xml index 28fea40ebdf..abf6232c1a2 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -399,7 +399,7 @@ self.log("... OK") - + diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 38c966f62d0..494ba34237e 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2531,12 +2531,6 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. 
GpuOverrides.parts.values.toSeq.sortBy(_.tag.toString).foreach(_.confHelp(asTable)) } def main(args: Array[String]): Unit = { - if (args.length != 2) { - System.err.println(s"Usage: ${this.getClass.getCanonicalName}" + - s" {common_configs_path} {advanced_configs_path}") - System.exit(1) - } - // Include the configs in PythonConfEntries com.nvidia.spark.rapids.python.PythonConfEntries.init() val configs = new FileOutputStream(new File(args(0))) From 951c5f3f5ae2e5e06f18c760fc94e67f6535a255 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 9 Aug 2024 14:16:21 -0700 Subject: [PATCH 38/40] missing change for scala 2.13 --- scala2.13/dist/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index ebfa0fcf829..b9a958493d8 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -399,7 +399,7 @@ self.log("... OK") - + From ab5d8b4553de684d181bf45daeabbe192189d708 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 12 Aug 2024 14:28:26 -0700 Subject: [PATCH 39/40] Fix more tests to set configs --- .../src/main/python/arithmetic_ops_test.py | 113 +++++++++++------- integration_tests/src/main/python/ast_test.py | 8 +- .../src/main/python/cast_test.py | 27 +++-- .../src/main/python/date_time_test.py | 3 +- .../src/main/python/hash_aggregate_test.py | 11 +- .../src/main/python/string_test.py | 3 +- 6 files changed, 108 insertions(+), 57 deletions(-) diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index d7fd941b97b..18e9bba6a2c 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -92,6 +92,8 @@ pytest.param(_decimal_gen_38_0, marks=pytest.mark.skipif( is_spark_330_or_later(), reason='This case overflows in Spark 3.3.0+'))] +allow_neg_scale_conf = {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'} + def _get_overflow_df(spark, data, data_type, expr): return spark.createDataFrame( SparkContext.getOrCreate().parallelize([data]), @@ -108,7 +110,8 @@ def test_addition(data_gen): f.lit(-12).cast(data_type) + f.col('b'), f.lit(None).cast(data_type) + f.col('a'), f.col('b') + f.lit(None).cast(data_type), - f.col('a') + f.col('b'))) + f.col('a') + f.col('b')), + conf=allow_neg_scale_conf) # If it will not overflow for multiply it is good for add too @pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens, ids=idfn) @@ -133,7 +136,8 @@ def test_subtraction(data_gen): f.lit(-12).cast(data_type) - f.col('b'), f.lit(None).cast(data_type) - f.col('a'), f.col('b') - f.lit(None).cast(data_type), - f.col('a') - f.col('b'))) + f.col('a') - f.col('b')), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(6, 5), DecimalGen(6, 4), DecimalGen(5, 4), DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(3, -2), @@ -145,7 +149,8 @@ def test_subtraction(data_gen): @disable_ansi_mode def test_addition_subtraction_mixed(lhs, rhs, addOrSub): assert_gpu_and_cpu_are_equal_collect( - lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"a {addOrSub} b") + lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"a {addOrSub} b"), + conf=allow_neg_scale_conf ) # If it will not overflow for multiply it is good for subtract too @@ -178,7 +183,8 @@ def test_multiplication(data_gen): f.lit(None).cast(data_type) * f.col('a'), f.col('b') * f.lit(None).cast(data_type), f.col('a') * f.col('b') - )) + ), + 
conf=allow_neg_scale_conf) @allow_non_gpu('ProjectExec', 'Alias', 'Multiply', 'Cast') @pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens_for_fallback, ids=idfn) @@ -215,16 +221,18 @@ def test_multiplication_ansi_overflow(): def test_multiplication_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( - f.col('a') * f.col('b'))) + f.col('a') * f.col('b')), + conf=allow_neg_scale_conf) @approximate_float # we should get the perfectly correct answer for floats except when casting a decimal to a float in some corner cases. @pytest.mark.parametrize('lhs', [float_gen, double_gen], ids=idfn) @pytest.mark.parametrize('rhs', [DecimalGen(6, 3), DecimalGen(10, -2), DecimalGen(15, 3)], ids=idfn) def test_float_multiplication_mixed(lhs, rhs): + conf = copy_and_update(allow_neg_scale_conf, {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( f.col('a') * f.col('b')), - conf={'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + conf=conf) @pytest.mark.parametrize('data_gen', [double_gen, decimal_gen_32bit_neg_scale, DecimalGen(6, 3), DecimalGen(5, 5), DecimalGen(6, 0), DecimalGen(7, 4), DecimalGen(15, 0), DecimalGen(18, 0), @@ -238,7 +246,8 @@ def test_division(data_gen): f.lit(-12).cast(data_type) / f.col('b'), f.lit(None).cast(data_type) / f.col('a'), f.col('b') / f.lit(None).cast(data_type), - f.col('a') / f.col('b'))) + f.col('a') / f.col('b')), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('rhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(4, 1), DecimalGen(5, 0), DecimalGen(5, 1), DecimalGen(10, 5)], ids=idfn) @pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(1, -2), DecimalGen(16, 1)], ids=idfn) @@ -247,7 +256,8 @@ def test_division_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( f.col('a'), f.col('b'), - f.col('a') / f.col('b'))) + f.col('a') / f.col('b')), + conf=allow_neg_scale_conf) # Spark has some problems with some decimal operations where it can try to generate a type that is invalid (scale > precision) which results in an error # instead of increasing the precision. So we have a second test that deals with a few of these use cases @@ -258,7 +268,8 @@ def test_division_mixed_larger_dec(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( f.col('a'), f.col('b'), - f.col('a') / f.col('b'))) + f.col('a') / f.col('b')), + conf=allow_neg_scale_conf) @disable_ansi_mode def test_special_decimal_division(): @@ -268,7 +279,8 @@ def test_special_decimal_division(): data_gen = DecimalGen(precision, scale) assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, data_gen, data_gen).select( - f.col('a') / f.col('b'))) + f.col('a') / f.col('b')), + conf=allow_neg_scale_conf) @approximate_float # we should get the perfectly correct answer for floats except when casting a decimal to a float in some corner cases. 
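The hunks above repeatedly build a per-test conf by layering extra settings on top of allow_neg_scale_conf with copy_and_update. That helper already exists in the integration-test utilities; the sketch below only illustrates the semantics these call sites rely on (copy the base dict, apply the overrides, later entries win) and is not the project's actual implementation.

    def copy_and_update(conf, *updates):
        # Assumed behavior, inferred from the call sites above: never mutate
        # the caller's dict; apply each update in order so later keys win.
        merged = dict(conf)
        for extra in updates:
            merged.update(extra)
        return merged

    # e.g. the pattern used by the float tests above:
    conf = copy_and_update(allow_neg_scale_conf,
                           {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'})
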
@pytest.mark.parametrize('rhs', [float_gen, double_gen], ids=idfn) @@ -278,7 +290,7 @@ def test_float_division_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( f.col('a') / f.col('b')), - conf={'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', integral_gens + [ decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_7_7, _decimal_gen_18_3, _decimal_gen_30_2, @@ -301,7 +313,8 @@ def test_int_division(data_gen): def test_int_division_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).selectExpr( - 'a DIV b')) + 'a DIV b'), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', _arith_data_gens, ids=idfn) @disable_ansi_mode @@ -313,7 +326,8 @@ def test_mod(data_gen): f.lit(-12).cast(data_type) % f.col('b'), f.lit(None).cast(data_type) % f.col('a'), f.col('b') % f.lit(None).cast(data_type), - f.col('a') % f.col('b'))) + f.col('a') % f.col('b')), + conf=allow_neg_scale_conf) # pmod currently falls back for Decimal(precision=38) # https://github.com/NVIDIA/spark-rapids/issues/6336 @@ -434,7 +448,8 @@ def test_mod_pmod_by_zero_not_ansi(data_gen): @disable_ansi_mode def test_mod_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( - lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"a % b")) + lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"a % b"), + conf=allow_neg_scale_conf) # @pytest.mark.skipif(not is_databricks113_or_later() and not is_spark_340_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/8330") @pytest.mark.parametrize('lhs', [DecimalGen(38,0), DecimalGen(37,2), DecimalGen(38,5), DecimalGen(38,-10), DecimalGen(38,7)], ids=idfn) @@ -442,7 +457,8 @@ def test_mod_mixed(lhs, rhs): @disable_ansi_mode def test_mod_mixed_decimal128(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( - lambda spark : two_col_df(spark, lhs, rhs).selectExpr("a", "b", f"a % b")) + lambda spark : two_col_df(spark, lhs, rhs).selectExpr("a", "b", f"a % b"), + conf=allow_neg_scale_conf) # Split into 4 tests to permute https://github.com/NVIDIA/spark-rapids/issues/7553 failures @pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen], ids=idfn) @@ -461,7 +477,8 @@ def test_pmod_mixed_numeric(lhs, rhs): def test_pmod_mixed_decimal_lhs(lhs, rhs): assert_gpu_fallback_collect( lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"pmod(a, b)"), - "Pmod") + "Pmod", + conf=allow_neg_scale_conf) @allow_non_gpu("ProjectExec", "Pmod") @pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen], ids=idfn) @@ -472,7 +489,8 @@ def test_pmod_mixed_decimal_lhs(lhs, rhs): def test_pmod_mixed_decimal_rhs(lhs, rhs): assert_gpu_fallback_collect( lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"pmod(a, b)"), - "Pmod") + "Pmod", + conf=allow_neg_scale_conf) @allow_non_gpu("ProjectExec", "Pmod") @pytest.mark.parametrize('lhs', [DecimalGen(6, 5), DecimalGen(6, 4), DecimalGen(5, 4), DecimalGen(5, 3), @@ -485,7 +503,8 @@ def test_pmod_mixed_decimal_rhs(lhs, rhs): def test_pmod_mixed_decimal(lhs, rhs): assert_gpu_fallback_collect( lambda spark : two_col_df(spark, lhs, rhs).selectExpr(f"pmod(a, b)"), - "Pmod") + "Pmod", + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) def test_signum(data_gen): @@ -496,27 +515,31 @@ def test_signum(data_gen): @disable_ansi_mode def test_unary_minus(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : 
unary_op_df(spark, data_gen).selectExpr('-a')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('-a'), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', _arith_decimal_gens_high_precision, ids=idfn) @pytest.mark.skipif(is_scala213(), reason="Apache Spark built with Scala 2.13 produces inconsistent results at high precision (SPARK-45438)") def test_unary_minus_decimal128(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('-a')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('-a'), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens + [float_gen, double_gen] + _arith_decimal_gens_low_precision, ids=idfn) def test_unary_minus_ansi_no_overflow(data_gen): + conf = copy_and_update(ansi_enabled_conf, allow_neg_scale_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr('-a'), - conf=ansi_enabled_conf) + conf=conf) @pytest.mark.parametrize('data_gen', _arith_decimal_gens_high_precision, ids=idfn) @pytest.mark.skipif(is_scala213(), reason="Apache Spark built with Scala 2.13 produces inconsistent results at high precision (SPARK-45438)") def test_unary_minus_ansi_no_overflow_decimal128(data_gen): + conf = copy_and_update(ansi_enabled_conf, allow_neg_scale_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr('-a'), - conf=ansi_enabled_conf) + conf=conf) @pytest.mark.parametrize('data_type,value', [ (LongType(), LONG_MIN), @@ -540,34 +563,39 @@ def test_unary_minus_ansi_overflow(data_type, value): @pytest.mark.parametrize('data_gen', _arith_data_gens, ids=idfn) def test_unary_positive(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('+a')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('+a'), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', numeric_gens + _arith_decimal_gens_low_precision, ids=idfn) @disable_ansi_mode def test_abs(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('abs(a)')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('abs(a)'), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', _arith_decimal_gens_high_precision, ids=idfn) @pytest.mark.skipif(is_scala213(), reason="Apache Spark built with Scala 2.13 produces inconsistent results at high precision (SPARK-45438)") def test_abs_decimal128(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('abs(a)')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('abs(a)'), + conf=allow_neg_scale_conf) # ANSI is ignored for abs prior to 3.2.0, but still okay to test it a little more. 
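A recurring theme in these test changes is that a Spark-side setting such as spark.sql.legacy.allowNegativeScaleOfDecimal is now opted into per test rather than assumed from the session-wide defaults; the generated spark-rapids-default-configs.json produced by RapidsConf.dumpConfigsWithDefault only covers spark.rapids.* entries, so Spark SQL settings have to be set explicitly where a test needs them. The snippet below is illustrative only: the keys are config names that appear elsewhere in this change, while the values are placeholders, since the real file is produced from RapidsConf at build time.

    # Illustrative shape of target/spark-rapids-default-configs.json, expressed
    # as the Python dict it deserializes into. Values here are placeholders,
    # not the actual RapidsConf defaults.
    illustrative_defaults = {
        'spark.rapids.sql.castDecimalToFloat.enabled': False,   # placeholder
        'spark.rapids.sql.castFloatToDecimal.enabled': False,   # placeholder
        # ... one entry per config registered in RapidsConf ...
    }
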
@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens + [float_gen, double_gen] + _arith_decimal_gens_low_precision, ids=idfn) def test_abs_ansi_no_overflow(data_gen): + conf = copy_and_update(ansi_enabled_conf, allow_neg_scale_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr('abs(a)'), - conf=ansi_enabled_conf) + conf=conf) @pytest.mark.parametrize('data_gen', _arith_decimal_gens_high_precision, ids=idfn) @pytest.mark.skipif(is_scala213(), reason="Apache Spark built with Scala 2.13 produces inconsistent results at high precision") def test_abs_ansi_no_overflow_decimal128(data_gen): + conf = copy_and_update(ansi_enabled_conf, allow_neg_scale_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr('a','abs(a)'), - conf=ansi_enabled_conf) + conf=conf) # Only run this test for Spark v3.2.0 and later to verify abs will # throw exceptions for overflow when ANSI mode is enabled. @@ -644,11 +672,11 @@ def test_floor_ceil_overflow(data_gen): "pyspark.errors.exceptions.captured.ArithmeticException: [NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION]" assert_gpu_and_cpu_error( lambda spark: unary_op_df(spark, data_gen).selectExpr('floor(a)').collect(), - conf={}, + conf=allow_neg_scale_conf, error_message=exception_type) assert_gpu_and_cpu_error( lambda spark: unary_op_df(spark, data_gen).selectExpr('ceil(a)').collect(), - conf={}, + conf=allow_neg_scale_conf, error_message=exception_type) @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) @@ -715,7 +743,8 @@ def test_decimal_bround(data_gen): 'bround(a, -1)', 'bround(a, 1)', 'bround(a, 2)', - 'bround(a, 10)')) + 'bround(a, 10)'), + conf=allow_neg_scale_conf) @incompat @approximate_float @@ -730,7 +759,8 @@ def test_decimal_round(data_gen): 'round(a, -1)', 'round(a, 1)', 'round(a, 2)', - 'round(a, 10)')) + 'round(a, 10)'), + conf=allow_neg_scale_conf) @incompat @@ -866,7 +896,8 @@ def test_cosh(data_gen): @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) def test_acosh(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)'), + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'false'}) # The default approximate is 1e-6 or 1 in a million # in some cases we need to adjust this because the algorithm is different @@ -876,8 +907,7 @@ def test_acosh(data_gen): @pytest.mark.parametrize('data_gen', [DoubleGen(min_exp=-20, max_exp=20)], ids=idfn) def test_columnar_acosh_improved(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)'), - {'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) + lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)')) @approximate_float @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) @@ -901,7 +931,8 @@ def test_asin(data_gen): @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) def test_asinh(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('asinh(a)')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('asinh(a)'), + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'false'}) # The default approximate is 1e-6 or 1 in a million # in some cases we need to adjust this because the algorithm is different @@ -912,7 +943,7 @@ def test_asinh(data_gen): def test_columnar_asinh_improved(data_gen): assert_gpu_and_cpu_are_equal_collect( 
lambda spark : unary_op_df(spark, data_gen).selectExpr('asinh(a)'), - {'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) @approximate_float @pytest.mark.parametrize('data_gen', double_gens, ids=idfn) @@ -1017,7 +1048,7 @@ def test_columnar_pow(data_gen): def test_least(data_gen): num_cols = 20 s1 = with_cpu_session( - lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))) + lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen)), conf=allow_neg_scale_conf) # we want lots of nulls gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=100.0)) for x in range(0, num_cols)], nullable=False) @@ -1027,13 +1058,14 @@ def test_least(data_gen): data_type = data_gen.data_type assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gen).select( - f.least(*command_args))) + f.least(*command_args)), + conf=allow_neg_scale_conf) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) def test_greatest(data_gen): num_cols = 20 s1 = with_cpu_session( - lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen))) + lambda spark: gen_scalar(data_gen, force_no_nulls=not isinstance(data_gen, NullGen)), conf=allow_neg_scale_conf) # we want lots of nulls gen = StructGen([('_c' + str(x), data_gen.copy_special_case(None, weight=100.0)) for x in range(0, num_cols)], nullable=False) @@ -1042,7 +1074,8 @@ def test_greatest(data_gen): data_type = data_gen.data_type assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gen).select( - f.greatest(*command_args))) + f.greatest(*command_args)), + conf=allow_neg_scale_conf) def _test_div_by_zero(ansi_mode, expr, is_lit=False): diff --git a/integration_tests/src/main/python/ast_test.py b/integration_tests/src/main/python/ast_test.py index ed6547a429b..7e9c91e4a47 100644 --- a/integration_tests/src/main/python/ast_test.py +++ b/integration_tests/src/main/python/ast_test.py @@ -191,12 +191,14 @@ def test_atan(data_descr): @approximate_float @pytest.mark.parametrize('data_descr', [(double_gen, False)], ids=idfn) def test_asinh(data_descr): - assert_unary_ast(data_descr, lambda df: df.selectExpr('asinh(a)')) + assert_unary_ast(data_descr, lambda df: df.selectExpr('asinh(a)'), + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'false'}) @approximate_float @pytest.mark.parametrize('data_descr', ast_double_descr, ids=idfn) def test_acosh(data_descr): - assert_unary_ast(data_descr, lambda df: df.selectExpr('acosh(a)')) + assert_unary_ast(data_descr, lambda df: df.selectExpr('acosh(a)'), + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'false'}) @approximate_float @pytest.mark.parametrize('data_descr', ast_double_descr, ids=idfn) @@ -211,7 +213,7 @@ def test_atanh(data_descr): @pytest.mark.parametrize('data_descr', [(DoubleGen(min_exp=-20, max_exp=20), True)], ids=idfn) def test_asinh_improved(data_descr): assert_unary_ast(data_descr, lambda df: df.selectExpr('asinh(a)'), - conf={'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) + conf={'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) # The default approximate is 1e-6 or 1 in a million # in some cases we need to adjust this because the algorithm is different diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index f7784178182..ec22253a9a3 100644 --- a/integration_tests/src/main/python/cast_test.py +++ 
b/integration_tests/src/main/python/cast_test.py @@ -194,7 +194,8 @@ def test_cast_string_timestamp_fallback(): def test_cast_decimal_to(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')), - conf = {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + conf = {'spark.rapids.sql.castDecimalToFloat.enabled': 'true', + 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @approximate_float @pytest.mark.parametrize('data_gen', [ @@ -208,7 +209,8 @@ def test_ansi_cast_decimal_to(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')), conf = {'spark.rapids.sql.castDecimalToFloat.enabled': True, - 'spark.sql.ansi.enabled': True}) + 'spark.sql.ansi.enabled': True, + 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10050') @pytest.mark.parametrize('data_gen', [ @@ -228,7 +230,8 @@ def test_ansi_cast_decimal_to(data_gen, to_type): DecimalType(1, -1)], ids=meta_idfn('to:')) def test_cast_decimal_to_decimal(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a'))) + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen], ids=idfn) @pytest.mark.parametrize('to_type', [ @@ -248,22 +251,26 @@ def test_cast_integral_to_decimal(data_gen, to_type): def test_cast_byte_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, byte_gen).select( - f.col('a').cast(DecimalType(2, -1)))) + f.col('a').cast(DecimalType(2, -1))), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) def test_cast_short_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, short_gen).select( - f.col('a').cast(DecimalType(4, -1)))) + f.col('a').cast(DecimalType(4, -1))), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) def test_cast_int_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, int_gen).select( - f.col('a').cast(DecimalType(9, -1)))) + f.col('a').cast(DecimalType(9, -1))), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) def test_cast_long_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, long_gen).select( - f.col('a').cast(DecimalType(18, -1)))) + f.col('a').cast(DecimalType(18, -1))), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) _float_special_cases = [(float("inf"), 5.0), (float("-inf"), 5.0), (float("nan"), 5.0)] @@ -282,7 +289,8 @@ def test_cast_floating_point_to_decimal(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( f.col('a'), f.col('a').cast(to_type)), - conf={'spark.rapids.sql.castFloatToDecimal.enabled': 'true'}) + conf={'spark.rapids.sql.castFloatToDecimal.enabled': 'true', + 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) # casting these types to string should be passed basic_gens_for_cast_to_string = [ByteGen, ShortGen, IntegerGen, LongGen, StringGen, BooleanGen, DateGen, TimestampGen] @@ -439,7 +447,8 @@ def is_neg_dec_scale_bug_version(): def 
test_cast_string_to_negative_scale_decimal(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, StringGen("[0-9]{9}")).select( - f.col('a').cast(DecimalType(8, -3)))) + f.col('a').cast(DecimalType(8, -3))), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 0c877f00238..34678a2e1ad 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -622,7 +622,8 @@ def test_unsupported_fallback_to_date(): @allow_non_gpu(*non_utc_allow) def test_timestamp_seconds(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)")) + lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)"), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @allow_non_gpu(*non_utc_allow) def test_timestamp_seconds_long_overflow(): diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index c74b8611475..5f1470d4d9c 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -410,6 +410,7 @@ def test_hash_reduction_sum(data_gen, conf): @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @datagen_overrides(seed=0, permanent=True, reason='https://github.com/NVIDIA/spark-rapids/issues/9779') def test_hash_reduction_sum_full_decimal(data_gen, conf): + conf = copy_and_update(conf, {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, length=100).selectExpr("SUM(a)"), conf = conf) @@ -454,7 +455,8 @@ def test_hash_avg_nulls_partial_only(data_gen): @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_intersect_all(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100))) + lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100)), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) @approximate_float @ignore_order @@ -463,7 +465,8 @@ def test_intersect_all(data_gen): @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) def test_exceptAll(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) + lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b')), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) # Spark fails to sort some decimal values due to overflow when calculating the sorting prefix. 
# See https://issues.apache.org/jira/browse/SPARK-40129 @@ -491,6 +494,7 @@ def test_exceptAll(data_gen): @pytest.mark.parametrize('data_gen', _pivot_gens_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_pivot(data_gen, conf): + conf = copy_and_update(conf, {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a') @@ -1112,7 +1116,8 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() non_exist_classes='GpuApproximatePercentile,GpuObjectHashAggregateExec', table_name='table', sql="""select k, - approx_percentile(v, array(0.25, 0.5, 0.75)) from table group by k""") + approx_percentile(v, array(0.25, 0.5, 0.75)) from table group by k""", + conf={'spark.rapids.sql.incompatibleOps.enabled': 'false'}) @approximate_float @ignore_order diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 4ae4a827aa0..e9bc9db6462 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -878,7 +878,8 @@ def test_format_number_supported(data_gen): 'format_number(a, 10)', 'format_number(a, 13)', 'format_number(a, 30)', - 'format_number(a, 100)') + 'format_number(a, 100)'), + conf={'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'} ) format_float_special_vals = [float('nan'), float('inf'), float('-inf'), 0.0, -0.0, From 04d86b8f596daf9a0743894cf165ace55af13007 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 15 Aug 2024 10:25:35 -0700 Subject: [PATCH 40/40] add back explicit configs --- integration_tests/src/main/python/arithmetic_ops_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index 18e9bba6a2c..9943d7deb7e 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -287,10 +287,11 @@ def test_special_decimal_division(): @pytest.mark.parametrize('lhs', [DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(1, -2), DecimalGen(16, 1)], ids=idfn) @disable_ansi_mode def test_float_division_mixed(lhs, rhs): + conf = copy_and_update(allow_neg_scale_conf, {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( f.col('a') / f.col('b')), - conf=allow_neg_scale_conf) + conf=conf) @pytest.mark.parametrize('data_gen', integral_gens + [ decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_7_7, _decimal_gen_18_3, _decimal_gen_30_2, @@ -907,7 +908,8 @@ def test_acosh(data_gen): @pytest.mark.parametrize('data_gen', [DoubleGen(min_exp=-20, max_exp=20)], ids=idfn) def test_columnar_acosh_improved(data_gen): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)')) + lambda spark : unary_op_df(spark, data_gen).selectExpr('acosh(a)'), + {'spark.rapids.sql.improvedFloatOps.enabled': 'true'}) @approximate_float @pytest.mark.parametrize('data_gen', double_gens, ids=idfn)
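run_pyspark_from_build.sh hands the generated defaults file to pytest through the --default_configs_path option. The conftest.py side of that option is not shown in this change, so the following is only a hypothetical sketch of how such an option is commonly wired up in pytest; the option name comes from the script above, while the fixture name and overall shape are assumptions rather than the project's actual wiring.

    # Hypothetical sketch of consuming --default_configs_path in a pytest
    # conftest.py; not the repository's actual implementation.
    import json
    import pytest

    def pytest_addoption(parser):
        parser.addoption('--default_configs_path', action='store', default=None,
                         help='JSON file mapping spark-rapids configs to their defaults')

    @pytest.fixture(scope='session')
    def default_configs(request):
        path = request.config.getoption('--default_configs_path')
        with open(path) as f:
            return json.load(f)  # keys are config names, values their defaults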