From 5068921bd54ae60aed16f8df129278796c840fe8 Mon Sep 17 00:00:00 2001 From: nvauto <70000568+nvauto@users.noreply.github.com> Date: Mon, 25 Nov 2024 06:15:06 +0000 Subject: [PATCH 01/37] Init version 25.02.0-SNAPSHOT Keep the rapids JNI and private dependency version at 24.12.0-SNAPSHOT until the nightly CI for the branch-25.02 branch is complete. Track the dependency update process at: https://github.com/NVIDIA/spark-rapids/issues/11755 Signed-off-by: nvauto <70000568+nvauto@users.noreply.github.com> --- CONTRIBUTING.md | 8 ++++---- README.md | 2 +- aggregator/pom.xml | 4 ++-- api_validation/pom.xml | 4 ++-- datagen/README.md | 6 +++--- datagen/ScaleTest.md | 2 +- datagen/pom.xml | 4 ++-- delta-lake/delta-20x/pom.xml | 4 ++-- delta-lake/delta-21x/pom.xml | 4 ++-- delta-lake/delta-22x/pom.xml | 4 ++-- delta-lake/delta-23x/pom.xml | 4 ++-- delta-lake/delta-24x/pom.xml | 4 ++-- delta-lake/delta-spark330db/pom.xml | 4 ++-- delta-lake/delta-spark332db/pom.xml | 4 ++-- delta-lake/delta-spark341db/pom.xml | 4 ++-- delta-lake/delta-spark350db143/pom.xml | 4 ++-- delta-lake/delta-stub/pom.xml | 4 ++-- dist/pom.xml | 4 ++-- docs/configs.md | 2 +- docs/dev/shims.md | 12 ++++++------ docs/dev/testing.md | 4 ++-- integration_tests/README.md | 6 +++--- integration_tests/ScaleTest.md | 2 +- integration_tests/pom.xml | 4 ++-- jdk-profiles/pom.xml | 4 ++-- jenkins/databricks/create.py | 2 +- jenkins/databricks/init_cudf_udf.sh | 1 + jenkins/version-def.sh | 4 ++-- pom.xml | 3 ++- scala2.13/aggregator/pom.xml | 4 ++-- scala2.13/api_validation/pom.xml | 4 ++-- scala2.13/datagen/pom.xml | 4 ++-- scala2.13/delta-lake/delta-20x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-21x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-22x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-23x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-24x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark330db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark332db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark341db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark350db143/pom.xml | 4 ++-- scala2.13/delta-lake/delta-stub/pom.xml | 4 ++-- scala2.13/dist/pom.xml | 4 ++-- scala2.13/integration_tests/pom.xml | 4 ++-- scala2.13/jdk-profiles/pom.xml | 4 ++-- scala2.13/pom.xml | 3 ++- scala2.13/shim-deps/cloudera/pom.xml | 4 ++-- scala2.13/shim-deps/databricks/pom.xml | 4 ++-- scala2.13/shim-deps/pom.xml | 4 ++-- scala2.13/shuffle-plugin/pom.xml | 4 ++-- scala2.13/sql-plugin-api/pom.xml | 4 ++-- scala2.13/sql-plugin/pom.xml | 4 ++-- scala2.13/tests/pom.xml | 4 ++-- scala2.13/tools/pom.xml | 4 ++-- scala2.13/udf-compiler/pom.xml | 4 ++-- shim-deps/cloudera/pom.xml | 4 ++-- shim-deps/databricks/pom.xml | 4 ++-- shim-deps/pom.xml | 4 ++-- shuffle-plugin/pom.xml | 4 ++-- sql-plugin-api/pom.xml | 4 ++-- .../scala/com/nvidia/spark/rapids/ShimLoader.scala | 8 ++++---- sql-plugin/pom.xml | 4 ++-- .../main/scala/com/nvidia/spark/rapids/Plugin.scala | 6 +++--- .../scala/com/nvidia/spark/rapids/RapidsConf.scala | 2 +- tests/pom.xml | 4 ++-- tools/pom.xml | 4 ++-- udf-compiler/pom.xml | 4 ++-- 67 files changed, 138 insertions(+), 135 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83b30747abd..e4077ee5994 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. 
```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index 94b73565190..61914e49df0 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index c7a6c220247..a47745776bc 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index cddcf0c1ce1..f3339375806 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/datagen/README.md b/datagen/README.md index 022cc2f1eba..1c49c8db58e 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. 
After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.12.0 release would be -`target/datagen_2.12-24.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 25.02.0 release would be +`target/datagen_2.12-25.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-25.02.0-spark330.jar ``` After that you should be good to go. diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index a728ad9a13e..8e692173f5f 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-25.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index 9bdf897cfd7..fc2d8bc677c 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 1d41911c767..ba5443a7be2 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 7514088ca3a..602686d79ab 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 2ed0ea3b159..7867c573607 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 1daebdd0efb..f537de0be36 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 
25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 36ec92b70c0..443681b6cb3 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 95f54c6807c..4812c9d0097 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 4d792ee1ca5..306553caa43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 4b229e2e5b5..c7b4a4e2738 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-spark350db143/pom.xml b/delta-lake/delta-spark350db143/pom.xml index 1bca394b67c..1e166244e1e 100644 --- a/delta-lake/delta-spark350db143/pom.xml +++ b/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index 6d0471f9f01..31b8e03b366 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index d628dd4ba3b..b34292a25cd 100644 --- a/dist/pom.xml +++ b/dist/pom.xml 
@@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/docs/configs.md b/docs/configs.md index 7f9544496c4..04aecb41f02 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/shims.md b/docs/dev/shims.md index 0d62eb4cae8..24252df607e 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 9f1c33091f1..fe6c0b94c1f 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -5,5 +5,5 @@ nav_order: 2 parent: Developer Overview --- An overview of testing can be found within the repository at: -* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/tests#readme) -* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/integration_tests#readme) +* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-25.02/tests#readme) +* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-25.02/integration_tests#readme) diff --git a/integration_tests/README.md b/integration_tests/README.md index f5237de21a0..031b318bddf 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -263,7 +263,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-25.02.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-25.02.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -286,7 +286,7 @@ If you just want to verify the SQL replacement is working you will 
need to add t assumes CUDA 11.0 is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -443,7 +443,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-25.02.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index d9f47fab5cb..8b91331abc9 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-25.02.0-SNAPSHOT-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index aaff3455298..6054c30c795 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT integration_tests diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index caaa47245a8..b45da24bd58 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT jdk8 diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index 44c4c856466..7815f170dfb 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,7 +27,7 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-24.12.0-SNAPSHOT' + cluster_name = 'CI-GPU-databricks-25.02.0-SNAPSHOT' idletime = 240 runtime = '13.3.x-gpu-ml-scala2.12' num_workers = 1 diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 16b90b95c0e..94ca7473143 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ 
b/jenkins/databricks/init_cudf_udf.sh @@ -20,6 +20,7 @@ set -ex +# TODO: https://github.com/NVIDIA/spark-rapids/issues/11755 CUDF_VER=${CUDF_VER:-24.12} CUDA_VER=${CUDA_VER:-11.8} diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index 8600a2f8689..6c9a9fac4cb 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -29,8 +29,8 @@ IFS=$PRE_IFS CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility -PROJECT_VER=${PROJECT_VER:-"24.12.0-SNAPSHOT"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.12.0-SNAPSHOT"} +PROJECT_VER=${PROJECT_VER:-"25.02.0-SNAPSHOT"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"25.02.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.2.0"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. diff --git a/pom.xml b/pom.xml index 12828404031..79a6a765470 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -829,6 +829,7 @@ spark${buildver} cuda11 ${cuda.version} + 24.12.0-SNAPSHOT 24.12.0-SNAPSHOT 2.12 diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml index 74956108068..d9db5bcf14e 100644 --- a/scala2.13/aggregator/pom.xml +++ b/scala2.13/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.13 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml index f236345c301..2a2e08c6071 100644 --- a/scala2.13/api_validation/pom.xml +++ b/scala2.13/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml index d53ebc014c7..e8a07a79841 100644 --- a/scala2.13/datagen/pom.xml +++ b/scala2.13/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.13 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/scala2.13/delta-lake/delta-20x/pom.xml b/scala2.13/delta-lake/delta-20x/pom.xml index 20c77038f40..57551a3d164 100644 --- a/scala2.13/delta-lake/delta-20x/pom.xml +++ b/scala2.13/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/scala2.13/delta-lake/delta-21x/pom.xml b/scala2.13/delta-lake/delta-21x/pom.xml index 75a41cfa8e0..6fbcf6bb8d1 100644 --- a/scala2.13/delta-lake/delta-21x/pom.xml +++ b/scala2.13/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml 
rapids-4-spark-delta-21x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/scala2.13/delta-lake/delta-22x/pom.xml b/scala2.13/delta-lake/delta-22x/pom.xml index c6111eb51a0..ff919c7b48c 100644 --- a/scala2.13/delta-lake/delta-22x/pom.xml +++ b/scala2.13/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/scala2.13/delta-lake/delta-23x/pom.xml b/scala2.13/delta-lake/delta-23x/pom.xml index 84d1d7275c2..fe927c7a092 100644 --- a/scala2.13/delta-lake/delta-23x/pom.xml +++ b/scala2.13/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/scala2.13/delta-lake/delta-24x/pom.xml b/scala2.13/delta-lake/delta-24x/pom.xml index 0ffe6c84e10..781f7975523 100644 --- a/scala2.13/delta-lake/delta-24x/pom.xml +++ b/scala2.13/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml index 3c30b1b0dc8..d6f2ee68e10 100644 --- a/scala2.13/delta-lake/delta-spark330db/pom.xml +++ b/scala2.13/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.13 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/scala2.13/delta-lake/delta-spark332db/pom.xml b/scala2.13/delta-lake/delta-spark332db/pom.xml index a3501c1003c..de53ab84f32 100644 --- a/scala2.13/delta-lake/delta-spark332db/pom.xml +++ b/scala2.13/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.13 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/scala2.13/delta-lake/delta-spark341db/pom.xml b/scala2.13/delta-lake/delta-spark341db/pom.xml index c740362b11f..bd6a72ea04b 100644 --- a/scala2.13/delta-lake/delta-spark341db/pom.xml +++ b/scala2.13/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT 
../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/scala2.13/delta-lake/delta-spark350db143/pom.xml b/scala2.13/delta-lake/delta-spark350db143/pom.xml index d6046b64578..c19c2e0ad21 100644 --- a/scala2.13/delta-lake/delta-spark350db143/pom.xml +++ b/scala2.13/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/scala2.13/delta-lake/delta-stub/pom.xml b/scala2.13/delta-lake/delta-stub/pom.xml index 2f90b85acd7..2a334190cea 100644 --- a/scala2.13/delta-lake/delta-stub/pom.xml +++ b/scala2.13/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.13 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index 15df1ec69f8..0c8f12a9214 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.13 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 88ab2531235..67afb46c779 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT integration_tests diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml index 793bf0fb327..6ec2f369b96 100644 --- a/scala2.13/jdk-profiles/pom.xml +++ b/scala2.13/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.13 pom Shim JDK Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT jdk8 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index d52c8658423..d1368d81d97 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.13 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -829,6 +829,7 @@ spark${buildver} cuda11 ${cuda.version} + 24.12.0-SNAPSHOT 24.12.0-SNAPSHOT 2.13 diff --git a/scala2.13/shim-deps/cloudera/pom.xml b/scala2.13/shim-deps/cloudera/pom.xml index 95c49a2b1ca..be06f76c136 100644 --- a/scala2.13/shim-deps/cloudera/pom.xml +++ b/scala2.13/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom 
pom CDH Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/cloudera diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml index 9d6ff787ef1..4feb4045327 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/databricks diff --git a/scala2.13/shim-deps/pom.xml b/scala2.13/shim-deps/pom.xml index 66cfa22afea..6c7a4b991a7 100644 --- a/scala2.13/shim-deps/pom.xml +++ b/scala2.13/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.13 pom Shim Dependencies Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT release321cdh diff --git a/scala2.13/shuffle-plugin/pom.xml b/scala2.13/shuffle-plugin/pom.xml index b9e76b2f068..6f915a66212 100644 --- a/scala2.13/shuffle-plugin/pom.xml +++ b/scala2.13/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.13 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT shuffle-plugin diff --git a/scala2.13/sql-plugin-api/pom.xml b/scala2.13/sql-plugin-api/pom.xml index 3c48d7c13f2..91802901fc0 100644 --- a/scala2.13/sql-plugin-api/pom.xml +++ b/scala2.13/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.13 Module for Non-Shimmable API - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin-api false diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml index b96e1517690..e0ceea49776 100644 --- a/scala2.13/sql-plugin/pom.xml +++ b/scala2.13/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.13 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin diff --git a/scala2.13/tests/pom.xml b/scala2.13/tests/pom.xml index 377dc4671fb..6aa80019d27 100644 --- a/scala2.13/tests/pom.xml +++ b/scala2.13/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.13 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT tests diff --git a/scala2.13/tools/pom.xml b/scala2.13/tools/pom.xml index 0c3179e09ff..866987242ae 100644 --- a/scala2.13/tools/pom.xml +++ b/scala2.13/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/scala2.13/udf-compiler/pom.xml b/scala2.13/udf-compiler/pom.xml index 10ad46a48aa..09cce00ef27 100644 --- a/scala2.13/udf-compiler/pom.xml +++ b/scala2.13/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia 
rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.13 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT udf-compiler diff --git a/shim-deps/cloudera/pom.xml b/shim-deps/cloudera/pom.xml index a9b71366927..0505fed6bac 100644 --- a/shim-deps/cloudera/pom.xml +++ b/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom pom CDH Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/cloudera diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index edfa3d6f896..82618ba65cd 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/databricks diff --git a/shim-deps/pom.xml b/shim-deps/pom.xml index d90dfc34190..e5a047f5169 100644 --- a/shim-deps/pom.xml +++ b/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.12 pom Shim Dependencies Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT release321cdh diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml index 69d8f1b765b..fd92b6b0957 100644 --- a/shuffle-plugin/pom.xml +++ b/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.12 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT shuffle-plugin diff --git a/sql-plugin-api/pom.xml b/sql-plugin-api/pom.xml index 090a809fc05..b1080ef7d39 100644 --- a/sql-plugin-api/pom.xml +++ b/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.12 Module for Non-Shimmable API - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin-api false diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index 23a9ece7468..533fee141c5 100644 --- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -48,11 +48,11 @@ import org.apache.spark.util.MutableURLClassLoader Each shim can see a consistent parallel world without conflicts by referencing only one conflicting directory. 
E.g., Spark 3.2.0 Shim will use - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark320/ Spark 3.3.1 will use - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark331/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark331/ Using these Jar URL's allows referencing different bytecode produced from identical sources by incompatible Scala / Spark dependencies. */ diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml index c9cfb8ce99f..2b0a62a5b90 100644 --- a/sql-plugin/pom.xml +++ b/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.12 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index e20b21da520..331835a6634 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -133,11 +133,11 @@ object RapidsPluginUtils extends Logging { val possibleRapidsJarURLs = classloader.getResources(propName).asScala.toSet.toSeq.filter { url => { val urlPath = url.toString - // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-24.12.0-spark341.jar, + // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-25.02.0-spark341.jar, // and files stored under subdirs of '!/', e.g. - // rapids-4-spark_2.12-24.12.0-cuda11.jar!/spark330/rapids4spark-version-info.properties + // rapids-4-spark_2.12-25.02.0-cuda11.jar!/spark330/rapids4spark-version-info.properties // We only want to find the main jar, e.g. - // rapids-4-spark_2.12-24.12.0-cuda11.jar!/rapids4spark-version-info.properties + // rapids-4-spark_2.12-25.02.0-cuda11.jar!/rapids4spark-version-info.properties !urlPath.contains("rapids-4-spark-") && urlPath.endsWith("!/" + propName) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e279385be82..b77beb2e2bd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2441,7 +2441,7 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. |On startup use: `--conf [conf key]=[conf value]`. 
For example: | |``` - |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ + |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ |--conf spark.plugins=com.nvidia.spark.SQLPlugin \ |--conf spark.rapids.sql.concurrentGpuTasks=2 |``` diff --git a/tests/pom.xml b/tests/pom.xml index a8fef6b7930..bc67c0c3de0 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.12 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT tests diff --git a/tools/pom.xml b/tools/pom.xml index df919f112ef..23bae1bcd8d 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml index afe827baf78..a32c1d3813f 100644 --- a/udf-compiler/pom.xml +++ b/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.12 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT udf-compiler From 65394412f54f003c5be7b1a572a8e38164a5f025 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 25 Nov 2024 12:57:40 -0600 Subject: [PATCH 02/37] Enable JSON Scan and from_json by default (#11753) Signed-off-by: Robert (Bobby) Evans Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- .../advanced_configs.md | 6 +- docs/compatibility.md | 161 ++++++++---------- docs/supported_ops.md | 4 +- .../nvidia/spark/rapids/GpuOverrides.scala | 8 +- .../com/nvidia/spark/rapids/RapidsConf.scala | 4 +- .../320/supportedDataSource.csv | 2 +- tools/generated_files/320/supportedExprs.csv | 4 +- .../321/supportedDataSource.csv | 2 +- tools/generated_files/321/supportedExprs.csv | 4 +- .../321cdh/supportedDataSource.csv | 2 +- .../generated_files/321cdh/supportedExprs.csv | 4 +- .../322/supportedDataSource.csv | 2 +- tools/generated_files/322/supportedExprs.csv | 4 +- .../323/supportedDataSource.csv | 2 +- tools/generated_files/323/supportedExprs.csv | 4 +- .../324/supportedDataSource.csv | 2 +- tools/generated_files/324/supportedExprs.csv | 4 +- .../330/supportedDataSource.csv | 2 +- tools/generated_files/330/supportedExprs.csv | 4 +- .../330cdh/supportedDataSource.csv | 2 +- .../generated_files/330cdh/supportedExprs.csv | 4 +- .../331/supportedDataSource.csv | 2 +- tools/generated_files/331/supportedExprs.csv | 4 +- .../332/supportedDataSource.csv | 2 +- tools/generated_files/332/supportedExprs.csv | 4 +- .../332cdh/supportedDataSource.csv | 2 +- .../generated_files/332cdh/supportedExprs.csv | 4 +- .../333/supportedDataSource.csv | 2 +- tools/generated_files/333/supportedExprs.csv | 4 +- .../334/supportedDataSource.csv | 2 +- tools/generated_files/334/supportedExprs.csv | 4 +- .../340/supportedDataSource.csv | 2 +- tools/generated_files/340/supportedExprs.csv | 4 +- .../341/supportedDataSource.csv | 2 +- tools/generated_files/341/supportedExprs.csv | 4 +- .../342/supportedDataSource.csv | 2 +- 
tools/generated_files/342/supportedExprs.csv | 4 +- .../343/supportedDataSource.csv | 2 +- tools/generated_files/343/supportedExprs.csv | 4 +- .../344/supportedDataSource.csv | 2 +- tools/generated_files/344/supportedExprs.csv | 4 +- .../350/supportedDataSource.csv | 2 +- tools/generated_files/350/supportedExprs.csv | 4 +- .../351/supportedDataSource.csv | 2 +- tools/generated_files/351/supportedExprs.csv | 4 +- .../352/supportedDataSource.csv | 2 +- tools/generated_files/352/supportedExprs.csv | 4 +- .../353/supportedDataSource.csv | 2 +- tools/generated_files/353/supportedExprs.csv | 4 +- .../400/supportedDataSource.csv | 2 +- tools/generated_files/400/supportedExprs.csv | 4 +- tools/generated_files/supportedDataSource.csv | 2 +- tools/generated_files/supportedExprs.csv | 4 +- 53 files changed, 151 insertions(+), 176 deletions(-) diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index f3157b46099..07346a5b850 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -95,8 +95,8 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.format.hive.text.write.enabled|When set to false disables Hive text table write acceleration|false|Runtime spark.rapids.sql.format.iceberg.enabled|When set to false disables all Iceberg acceleration|true|Runtime spark.rapids.sql.format.iceberg.read.enabled|When set to false disables Iceberg input acceleration|true|Runtime -spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|false|Runtime -spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|false|Runtime +spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|true|Runtime +spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|true|Runtime spark.rapids.sql.format.orc.enabled|When set to false disables all orc input and output acceleration|true|Runtime spark.rapids.sql.format.orc.floatTypesToString.enable|When reading an ORC file, the source data schemas(schemas of ORC file) may differ from the target schemas (schemas of the reader), we need to handle the castings from source type to target type. Since float/double numbers in GPU have different precision with CPU, when casting float/double to string, the result of GPU is different from result of CPU spark. Its default value is `true` (this means the strings result will differ from result of CPU). If it's set `false` explicitly and there exists casting from float/double to string in the job, then such behavior will cause an exception, and the job will fail.|true|Runtime spark.rapids.sql.format.orc.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel. 
Used with MULTITHREADED reader, see spark.rapids.sql.format.orc.reader.type.|2147483647|Runtime @@ -278,7 +278,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.IsNaN|`isnan`|Checks if a value is NaN|true|None| spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None| spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None| -spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case| +spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|true|None| spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.| spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None| spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None| diff --git a/docs/compatibility.md b/docs/compatibility.md index 1382b1a9a1f..0c745069032 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -316,125 +316,102 @@ case. ## JSON -The JSON format read is an experimental feature which is expected to have some issues, so we disable -it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and -`spark.rapids.sql.format.json.read.enabled`. +JSON, despite being a standard format, has some ambiguity in it. Spark also offers the ability to allow +some invalid JSON to be parsed. We have tried to provide JSON parsing that is compatible with +what Apache Spark does support. Note that Spark itself has changed through different releases, and we will +try to call out which releases we offer different results for. JSON parsing is enabled by default +except for date and timestamp types where we still have work to complete. If you wish to disable +JSON Scan you can set `spark.rapids.sql.format.json.enabled` or +`spark.rapids.sql.format.json.read.enabled` to false. To disable `from_json` you can set +`spark.rapids.sql.expression.JsonToStructs` to false. -### Invalid JSON +### Limits -In Apache Spark on the CPU if a line in the JSON file is invalid the entire row is considered -invalid and will result in nulls being returned for all columns. It is considered invalid if it -violates the JSON specification, but with a few extensions. +In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After +3.5.0 this was updated to be 1,000 by default. The current GPU implementation of JSON Scan and +`from_json` limits this to 254 no matter what version of Spark is used. If the nesting level is +over this the JSON is considered invalid and all values will be returned as nulls. +`get_json_object` and `json_tuple` have a maximum nesting depth of 64. An exception is thrown if +the nesting depth goes over the maximum. 
- * Single quotes are allowed to quote strings and keys - * Unquoted values like NaN and Infinity can be parsed as floating point values - * Control characters do not need to be replaced with the corresponding escape sequences in a - quoted string. - * Garbage at the end of a row, if there is valid JSON at the beginning of the row, is ignored. +Spark 3.5.0 and above have limits on maximum string length 20,000,000 and maximum number length of +1,000. We do not have any of these limits on the GPU. -The GPU implementation does the same kinds of validations, but many of them are done on a per-column -basis, which, for example, means if a number is formatted incorrectly, it is likely only that value -will be considered invalid and return a null instead of nulls for the entire row. +We, like Spark, cannot support an JSON string that is larger than 2 GiB is size. -There are options that can be used to enable and disable many of these features which are mostly -listed below. +### JSON Validation -### JSON options +Spark supports the option `allowNonNumericNumbers`. Versions of Spark prior to 3.3.0 where inconsistent between +quoted and non-quoted values ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The +GPU implementation is consistent with 3.3.0 and above. -Spark supports passing options to the JSON parser when reading a dataset. In most cases if the RAPIDS Accelerator -sees one of these options that it does not support it will fall back to the CPU. In some cases we do not. The -following options are documented below. +### JSON Floating Point Types -- `allowNumericLeadingZeros` - Allows leading zeros in numbers (e.g. 00012). By default this is set to false. - When it is false Spark considers the JSON invalid if it encounters this type of number. The RAPIDS - Accelerator supports validating columns that are returned to the user with this option on or off. - -- `allowUnquotedControlChars` - Allows JSON Strings to contain unquoted control characters (ASCII characters with - value less than 32, including tab and line feed characters) or not. By default this is set to false. If the schema - is provided while reading JSON file, then this flag has no impact on the RAPIDS Accelerator as it always allows - unquoted control characters but Spark sees these are invalid are returns nulls. However, if the schema is not provided - and this option is false, then RAPIDS Accelerator's behavior is same as Spark where an exception is thrown - as discussed in `JSON Schema discovery` section. - -- `allowNonNumericNumbers` - Allows `NaN` and `Infinity` values to be parsed (note that these are not valid numeric - values in the [JSON specification](https://json.org)). Spark versions prior to 3.3.0 have inconsistent behavior and will - parse some variants of `NaN` and `Infinity` even when this option is disabled - ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The RAPIDS Accelerator behavior is consistent with - Spark version 3.3.0 and later. - -### Nesting -In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After -3.5.0 this was updated to be 1000 by default. The current GPU implementation limits this to 254 -no matter what version of Spark is used. If the nesting level is over this the JSON is considered -invalid and all values will be returned as nulls. - -Mixed types can have some problems. If an item being read could have some lines that are arrays -and others that are structs/dictionaries it is possible an error will be thrown. 
- -Dates and Timestamps have some issues and may return values for technically invalid inputs. - -Floating point numbers have issues generally like with the rest of Spark, and we can parse them into -a valid floating point number, but it might not match 100% with the way Spark does it. - -Strings are supported, but the data returned might not be normalized in the same way as the CPU -implementation. Generally this comes down to the GPU not modifying the input, whereas Spark will -do things like remove extra white space and parse numbers before turning them back into a string. +Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). -### JSON Floating Point +### JSON Integral Types -Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). +Versions of Spark prior to 3.3.0 would parse quoted integer values, like "1". But 3.3.0 and above consider +these to be invalid and will return `null` when parsed as an Integral types. The GPU implementation +follows 3.3.0 and above. -Prior to Spark 3.3.0, reading JSON strings such as `"+Infinity"` when specifying that the data type is `FloatType` -or `DoubleType` caused these values to be parsed even when `allowNonNumericNumbers` is set to false. Also, Spark -versions prior to 3.3.0 only supported the `"Infinity"` and `"-Infinity"` representations of infinity and did not -support `"+INF"`, `"-INF"`, or `"+Infinity"`, which Spark considers valid when unquoted. The GPU JSON reader is -consistent with the behavior in Spark 3.3.0 and later. +### JSON Decimal Types -Another limitation of the GPU JSON reader is that it will parse strings containing non-string boolean or numeric values where -Spark will treat them as invalid inputs and will just return `null`. +Spark supports parsing decimal types either formatted as floating point number or integral numbers, even if it is +in a quoted string. If it is in a quoted string the local of the JVM is used to determine the number format. +If the local is not for the `US`, which is the default we will fall back to the CPU because we do not currently +parse those numbers correctly. The `US` format removes all commas ',' from the quoted string. +As a part of this, though, non-arabic numbers are also supported. We do not support parsing these numbers +see (issue 10532)[https://github.com/NVIDIA/spark-rapids/issues/10532]. -### JSON Dates/Timestamps +### JSON Date/Timestamp Types Dates and timestamps are not supported by default in JSON parser, since the GPU implementation is not 100% compatible with Apache Spark. If needed, they can be turned on through the config `spark.rapids.sql.json.read.datetime.enabled`. -Once enabled, the JSON parser still does not support the `TimestampNTZ` type and will fall back to CPU -if `spark.sql.timestampType` is set to `TIMESTAMP_NTZ` or if an explicit schema is provided that -contains the `TimestampNTZ` type. +This config works for both JSON scan and `from_json`. Once enabled, the JSON parser still does +not support the `TimestampNTZ` type and will fall back to CPU if `spark.sql.timestampType` is set +to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. There is currently no support for reading numeric values as timestamps and null values are returned instead -([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast -to timestamp. 
+([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast to timestamp.
 
-### JSON Schema discovery
+### JSON Arrays and Structs with Overflowing Numbers
 
-Spark SQL can automatically infer the schema of a JSON dataset if schema is not provided explicitly. The CPU
-handles schema discovery and there is no GPU acceleration of this. By default Spark will read/parse the entire
-dataset to determine the schema. This means that some options/errors which are ignored by the GPU may still
-result in an exception if used with schema discovery.
+Spark is inconsistent between versions in how it handles numbers that overflow when they are nested in either an array
+or a non-top-level struct. In some versions only the value that overflowed is marked as null. In other versions the
+wrapping array or struct is marked as null. We currently only mark the individual value as null. This matches
+versions 3.4.2 and above of Spark for structs. Most versions of Spark invalidate the entire array if there
+is a single value within it that overflows.
 
-### `from_json` function
+### Duplicate Struct Names
 
-`JsonToStructs` of `from_json` is based on the same code as reading a JSON lines file. There are
-a few differences with it.
+The JSON specification technically allows for duplicate keys in a struct, but does not explain what to
+do with them. In the case of Spark, which value wins is inconsistent between operators, and for `get_json_object`
+it depends on the query being performed. We do not always match what Spark does. We do match it in many cases,
+but we consider this enough of a corner case that we have not tried to make it work in all cases.
 
-The `from_json` function is disabled by default because it is experimental and has some known
-incompatibilities with Spark, and can be enabled by setting
-`spark.rapids.sql.expression.JsonToStructs=true`. You don't need to set
-`spark.rapids.sql.format.json.enabled` and`spark.rapids.sql.format.json.read.enabled` to true.
-In addition, if the input schema contains date and/or timestamp types, an additional config
-`spark.rapids.sql.json.read.datetime.enabled` also needs to be set to `true` in order
-to enable this function on the GPU.
+We also do not support schemas where there are duplicate column names. We just fall back to the CPU for those cases.
 
-There is no schema discovery as a schema is required as input to `from_json`
+### JSON Normalization (String Types)
 
-In addition to `structs`, a top level `map` type is supported, but only if the key and value are
-strings.
+In versions of Spark prior to 4.0.0 input JSON strings were parsed to JSON tokens and then converted back to
+strings. This effectively normalizes the output string. So things like single quotes are transformed into double
+quotes, floating point numbers are parsed and converted back to strings, possibly changing the format, and
+escaped characters are converted back to their simplest form. We try to support this on the GPU as well. Single quotes
+will be converted to double quotes. Only `get_json_object` and `json_tuple` attempt to normalize floating point
+numbers. There is no implementation on the GPU right now that tries to normalize escape characters.
+
+### `from_json` Function
+
+`JsonToStructs` or `from_json` is based on the same code as reading a JSON lines file. There are
+a few differences with it, illustrated in the sketch below.
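+
+A minimal sketch of calling `from_json` with an explicit schema (assuming the RAPIDS Accelerator is active;
+the column names and sample data are made up, and `spark.rapids.sql.json.read.datetime.enabled` would
+additionally be needed for date/timestamp fields):
+
+```scala
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.from_json
+import org.apache.spark.sql.types.{IntegerType, MapType, StringType, StructField, StructType}
+
+val spark = SparkSession.builder().appName("from-json-sketch").getOrCreate()
+import spark.implicits._
+
+val df = Seq("""{"a": 1, "b": "x"}""").toDF("jsonStr")
+
+// A struct schema, just like JSON scan.
+val structSchema = StructType(Seq(StructField("a", IntegerType), StructField("b", StringType)))
+df.select(from_json($"jsonStr", structSchema).alias("parsed")).show()
+
+// A top-level MAP schema is also supported, but only with STRING keys and values.
+val mapSchema = MapType(StringType, StringType)
+df.select(from_json($"jsonStr", mapSchema).alias("asMap")).show()
+```
+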
-### `to_json` function +The main difference is that `from_json` supports parsing Maps and Arrays directly from a JSON column, whereas +JSON Scan only supports parsing top level structs. The GPU implementation of `from_json` has support for parsing +a `MAP` as a top level schema, but does not currently support arrays at the top level. -The `to_json` function is disabled by default because it is experimental and has some known incompatibilities -with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsToJson=true`. +### `to_json` Function Known issues are: @@ -442,7 +419,7 @@ Known issues are: produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. - Not all JSON options are respected -### get_json_object +### `get_json_object` Function Known issue: - [Floating-point number normalization error](https://github.com/NVIDIA/spark-rapids-jni/issues/1922). `get_json_object` floating-point number normalization on the GPU could sometimes return incorrect results if the string contains high-precision values, see the String to Float and Float to String section for more details. diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 2fa11f8aa6e..acf7133af40 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -9279,7 +9279,7 @@ are limited. JsonToStructs `from_json` Returns a struct value with the given `jsonStr` and `schema` -This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case +None project jsonStr @@ -9320,7 +9320,7 @@ are limited. NS -PS
MAP only supports keys and values that are of STRING type;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
+PS
MAP only supports keys and values that are of STRING type and is only supported at the top level;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
PS
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index bdeebaabbfc..45905f0b9e0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3780,7 +3780,8 @@ object GpuOverrides extends Logging { ExprChecks.projectOnly( TypeSig.STRUCT.nested(jsonStructReadTypes) + TypeSig.MAP.nested(TypeSig.STRING).withPsNote(TypeEnum.MAP, - "MAP only supports keys and values that are of STRING type"), + "MAP only supports keys and values that are of STRING type " + + "and is only supported at the top level"), (TypeSig.STRUCT + TypeSig.MAP + TypeSig.ARRAY).nested(TypeSig.all), Seq(ParamCheck("jsonStr", TypeSig.STRING, TypeSig.STRING))), (a, conf, p, r) => new UnaryExprMeta[JsonToStructs](a, conf, p, r) { @@ -3821,10 +3822,7 @@ object GpuOverrides extends Logging { override def convertToGpu(child: Expression): GpuExpression = // GPU implementation currently does not support duplicated json key names in input GpuJsonToStructs(a.schema, a.options, child, a.timeZoneId) - }).disabledByDefault("it is currently in beta and undergoes continuous enhancements."+ - " Please consult the "+ - "[compatibility documentation](../compatibility.md#json-supporting-types)"+ - " to determine whether you can enable this configuration for your use case"), + }), expr[StructsToJson]( "Converts structs to JSON text format", ExprChecks.projectOnly( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e279385be82..e22b8f53497 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1348,12 +1348,12 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .doc("When set to true enables all json input and output acceleration. 
" + "(only input is currently supported anyways)") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val ENABLE_JSON_READ = conf("spark.rapids.sql.format.json.read.enabled") .doc("When set to true enables json input acceleration") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val ENABLE_READ_JSON_FLOATS = conf("spark.rapids.sql.json.read.float.enabled") .doc("JSON reading is not 100% compatible when reading floats.") diff --git a/tools/generated_files/320/supportedDataSource.csv b/tools/generated_files/320/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/320/supportedDataSource.csv +++ b/tools/generated_files/320/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/320/supportedExprs.csv b/tools/generated_files/320/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/320/supportedExprs.csv +++ b/tools/generated_files/320/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/321/supportedDataSource.csv b/tools/generated_files/321/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/321/supportedDataSource.csv +++ b/tools/generated_files/321/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/321/supportedExprs.csv b/tools/generated_files/321/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/321/supportedExprs.csv +++ b/tools/generated_files/321/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/321cdh/supportedDataSource.csv b/tools/generated_files/321cdh/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/321cdh/supportedDataSource.csv +++ b/tools/generated_files/321cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/321cdh/supportedExprs.csv b/tools/generated_files/321cdh/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/321cdh/supportedExprs.csv +++ b/tools/generated_files/321cdh/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/322/supportedDataSource.csv b/tools/generated_files/322/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/322/supportedDataSource.csv +++ b/tools/generated_files/322/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/322/supportedExprs.csv b/tools/generated_files/322/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/322/supportedExprs.csv +++ b/tools/generated_files/322/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/323/supportedDataSource.csv b/tools/generated_files/323/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/323/supportedDataSource.csv +++ b/tools/generated_files/323/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/323/supportedExprs.csv b/tools/generated_files/323/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/323/supportedExprs.csv +++ b/tools/generated_files/323/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/324/supportedDataSource.csv b/tools/generated_files/324/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/324/supportedDataSource.csv +++ b/tools/generated_files/324/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/324/supportedExprs.csv b/tools/generated_files/324/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/324/supportedExprs.csv +++ b/tools/generated_files/324/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/330/supportedDataSource.csv b/tools/generated_files/330/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/330/supportedDataSource.csv +++ b/tools/generated_files/330/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/330/supportedExprs.csv b/tools/generated_files/330/supportedExprs.csv index fcea9c8cb40..0073281cb32 100644 --- a/tools/generated_files/330/supportedExprs.csv +++ b/tools/generated_files/330/supportedExprs.csv @@ -297,8 +297,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/330cdh/supportedDataSource.csv b/tools/generated_files/330cdh/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/330cdh/supportedDataSource.csv +++ b/tools/generated_files/330cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/330cdh/supportedExprs.csv b/tools/generated_files/330cdh/supportedExprs.csv index fcea9c8cb40..0073281cb32 100644 --- a/tools/generated_files/330cdh/supportedExprs.csv +++ b/tools/generated_files/330cdh/supportedExprs.csv @@ -297,8 +297,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/331/supportedDataSource.csv b/tools/generated_files/331/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/331/supportedDataSource.csv +++ b/tools/generated_files/331/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/331/supportedExprs.csv b/tools/generated_files/331/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/331/supportedExprs.csv +++ b/tools/generated_files/331/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/332/supportedDataSource.csv b/tools/generated_files/332/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/332/supportedDataSource.csv +++ b/tools/generated_files/332/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/332/supportedExprs.csv b/tools/generated_files/332/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/332/supportedExprs.csv +++ b/tools/generated_files/332/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/332cdh/supportedDataSource.csv b/tools/generated_files/332cdh/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/332cdh/supportedDataSource.csv +++ b/tools/generated_files/332cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/332cdh/supportedExprs.csv b/tools/generated_files/332cdh/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/332cdh/supportedExprs.csv +++ b/tools/generated_files/332cdh/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/333/supportedDataSource.csv b/tools/generated_files/333/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/333/supportedDataSource.csv +++ b/tools/generated_files/333/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/333/supportedExprs.csv b/tools/generated_files/333/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/333/supportedExprs.csv +++ b/tools/generated_files/333/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/334/supportedDataSource.csv b/tools/generated_files/334/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/334/supportedDataSource.csv +++ b/tools/generated_files/334/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/334/supportedExprs.csv b/tools/generated_files/334/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/334/supportedExprs.csv +++ b/tools/generated_files/334/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/340/supportedDataSource.csv b/tools/generated_files/340/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/340/supportedDataSource.csv +++ b/tools/generated_files/340/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/340/supportedExprs.csv b/tools/generated_files/340/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/340/supportedExprs.csv +++ b/tools/generated_files/340/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/341/supportedDataSource.csv b/tools/generated_files/341/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/341/supportedDataSource.csv +++ b/tools/generated_files/341/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/341/supportedExprs.csv b/tools/generated_files/341/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/341/supportedExprs.csv +++ b/tools/generated_files/341/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/342/supportedDataSource.csv b/tools/generated_files/342/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/342/supportedDataSource.csv +++ b/tools/generated_files/342/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/342/supportedExprs.csv b/tools/generated_files/342/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/342/supportedExprs.csv +++ b/tools/generated_files/342/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/343/supportedDataSource.csv b/tools/generated_files/343/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/343/supportedDataSource.csv +++ b/tools/generated_files/343/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/343/supportedExprs.csv b/tools/generated_files/343/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/343/supportedExprs.csv +++ b/tools/generated_files/343/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/344/supportedDataSource.csv b/tools/generated_files/344/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/344/supportedDataSource.csv +++ b/tools/generated_files/344/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/344/supportedExprs.csv b/tools/generated_files/344/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/344/supportedExprs.csv +++ b/tools/generated_files/344/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/350/supportedDataSource.csv b/tools/generated_files/350/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/350/supportedDataSource.csv +++ b/tools/generated_files/350/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/350/supportedExprs.csv b/tools/generated_files/350/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/350/supportedExprs.csv +++ b/tools/generated_files/350/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/351/supportedDataSource.csv b/tools/generated_files/351/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/351/supportedDataSource.csv +++ b/tools/generated_files/351/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/351/supportedExprs.csv b/tools/generated_files/351/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/351/supportedExprs.csv +++ b/tools/generated_files/351/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/352/supportedDataSource.csv b/tools/generated_files/352/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/352/supportedDataSource.csv +++ b/tools/generated_files/352/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/352/supportedExprs.csv b/tools/generated_files/352/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/352/supportedExprs.csv +++ b/tools/generated_files/352/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/353/supportedDataSource.csv b/tools/generated_files/353/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/353/supportedDataSource.csv +++ b/tools/generated_files/353/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/353/supportedExprs.csv b/tools/generated_files/353/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/353/supportedExprs.csv +++ b/tools/generated_files/353/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/400/supportedDataSource.csv b/tools/generated_files/400/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/400/supportedDataSource.csv +++ b/tools/generated_files/400/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/400/supportedExprs.csv b/tools/generated_files/400/supportedExprs.csv index 890f959eab5..4cfa1020889 100644 --- a/tools/generated_files/400/supportedExprs.csv +++ b/tools/generated_files/400/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/supportedDataSource.csv b/tools/generated_files/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/supportedDataSource.csv +++ b/tools/generated_files/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/supportedExprs.csv b/tools/generated_files/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/supportedExprs.csv +++ b/tools/generated_files/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA From 938db2123f1bd7397d6270de0e2910cd94823098 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 25 Nov 2024 12:07:11 -0800 Subject: [PATCH 03/37] Fix aqe_test failures on [databricks] 14.3. (#11750) * Fix aqe_test failures on [databricks] 14.3. Fixes #11643. This commit fixes the AQE/DPP tests that were reported in #11643 to be failing on Databricks 14.3. This is the result of a deficient shim for GpuSubqueryBroadcastMeta being active for Databricks 14.3. The deficient shim errantly extended the non-Databricks base shim. This commit moves the commonality in Databricks shims to a common base class that is then customized for the changes in Databricks 14.3. Signed-off-by: MithunR --- integration_tests/src/main/python/aqe_test.py | 6 +- .../execution/GpuSubqueryBroadcastExec.scala | 6 +- .../execution/GpuSubqueryBroadcastMeta.scala | 102 ++------------- .../GpuSubqueryBroadcastMeta330DBBase.scala | 121 ++++++++++++++++++ .../execution/GpuSubqueryBroadcastMeta.scala | 3 +- .../execution/GpuSubqueryBroadcastMeta.scala | 35 +++++ 6 files changed, 171 insertions(+), 102 deletions(-) create mode 100644 sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala create mode 100644 sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index f9dddfae038..5b3b04efdfb 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -338,10 +338,10 @@ def do_it(spark): # this should be fixed by https://github.com/NVIDIA/spark-rapids/issues/11120 aqe_join_with_dpp_fallback=["FilterExec"] if (is_databricks_runtime() or is_before_spark_330()) else [] +if is_databricks_version_or_later(14, 3): + aqe_join_with_dpp_fallback.append("CollectLimitExec") # Verify that DPP and AQE can coexist in even some odd cases involving multiple tables -@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3), - reason="https://github.com/NVIDIA/spark-rapids/issues/11643") @ignore_order(local=True) @allow_non_gpu(*aqe_join_with_dpp_fallback) def test_aqe_join_with_dpp(spark_tmp_path): @@ -395,8 +395,6 @@ def run_test(spark): assert_gpu_and_cpu_are_equal_collect(run_test, conf=_adaptive_conf) # Verify that DPP and AQE can coexist in even some odd cases involving 2 tables with multiple columns 
-@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3), - reason="https://github.com/NVIDIA/spark-rapids/issues/11643") @ignore_order(local=True) @allow_non_gpu(*aqe_join_with_dpp_fallback) def test_aqe_join_with_dpp_multi_columns(spark_tmp_path): diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala index 72ed0e79504..e529e268f3f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala @@ -126,8 +126,10 @@ abstract class GpuSubqueryBroadcastMetaBase( } else { willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") } - case _ => - throw new AssertionError("should not reach here") + + case unexpected => + throw new AssertionError("Unexpected child exec in AdaptiveSparkPlan: " + + s"${unexpected.getClass.getName}") } case _ => diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala index 76255b3e5a6..ae32800e77a 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -21,105 +21,19 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.execution -import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta, SparkPlanMeta} +import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta} -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode -import org.apache.spark.sql.execution.{SparkPlan, SubqueryBroadcastExec} -import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} -import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode +import org.apache.spark.sql.execution.SubqueryBroadcastExec -class GpuSubqueryBroadcastMeta( - s: SubqueryBroadcastExec, - conf: RapidsConf, - p: Option[RapidsMeta[_, _, _]], - r: DataFromReplacementRule) extends - SparkPlanMeta[SubqueryBroadcastExec](s, conf, p, r) { - private var broadcastBuilder: () => SparkPlan = _ - - override val childExprs: Seq[BaseExprMeta[_]] = Nil - - override val childPlans: Seq[SparkPlanMeta[SparkPlan]] = Nil - - override def tagPlanForGpu(): Unit = s.child match { - // DPP: For AQE off, in this case, we handle DPP by converting the underlying - // BroadcastExchangeExec to GpuBroadcastExchangeExec. - // This is slightly different from the Apache Spark case, because Spark - // sends the underlying plan into the plugin in advance via the PlanSubqueries rule. - // Here, we have the full non-GPU subquery plan, so we convert the whole - // thing. 
- case ex @ BroadcastExchangeExec(_, child) => - val exMeta = new GpuBroadcastMeta(ex.copy(child = child), conf, p, r) - exMeta.tagForGpu() - if (exMeta.canThisBeReplaced) { - broadcastBuilder = () => exMeta.convertToGpu() - } else { - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - // DPP: For AQE on, we have an almost completely different scenario then before, - // Databricks uses a BroadcastQueryStageExec and either: - // 1) provide an underlying BroadcastExchangeExec that we will have to convert - // somehow - // 2) might already do the reuse work for us. The ReusedExchange is now a - // part of the SubqueryBroadcast, so we send it back here as underlying the - // GpuSubqueryBroadcastExchangeExec - case bqse: BroadcastQueryStageExec => - bqse.plan match { - case ex: BroadcastExchangeExec => - val exMeta = new GpuBroadcastMeta(ex, conf, p, r) - exMeta.tagForGpu() - if (exMeta.canThisBeReplaced) { - broadcastBuilder = () => exMeta.convertToGpu() - } else { - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - case reuse: ReusedExchangeExec => - reuse.child match { - case _: GpuBroadcastExchangeExec => - // A BroadcastExchange has already been replaced, so it can run on the GPU - broadcastBuilder = () => reuse - case _ => - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - } - case _ => - willNotWorkOnGpu("the subquery to broadcast can not entirely run in the GPU.") - } - /** - * Simply returns the original plan. Because its only child, BroadcastExchange, doesn't - * need to change if SubqueryBroadcastExec falls back to the CPU. - */ - override def convertToCpu(): SparkPlan = s +class GpuSubqueryBroadcastMeta(s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) + extends GpuSubqueryBroadcastMeta330DBBase(s, conf, p, r) { override def convertToGpu(): GpuExec = { GpuSubqueryBroadcastExec(s.name, Seq(s.index), s.buildKeys, broadcastBuilder())( getBroadcastModeKeyExprs) } - /** Extract the broadcast mode key expressions if there are any. */ - private def getBroadcastModeKeyExprs: Option[Seq[Expression]] = { - val broadcastMode = s.child match { - case b: BroadcastExchangeExec => - b.mode - case bqse: BroadcastQueryStageExec => - bqse.plan match { - case b: BroadcastExchangeExec => - b.mode - case reuse: ReusedExchangeExec => - reuse.child match { - case g: GpuBroadcastExchangeExec => - g.mode - } - case _ => - throw new AssertionError("should not reach here") - } - } - - broadcastMode match { - case HashedRelationBroadcastMode(keys, _) => Some(keys) - case IdentityBroadcastMode => None - case m => throw new UnsupportedOperationException(s"Unknown broadcast mode $m") - } - } -} +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala new file mode 100644 index 00000000000..a6248127bad --- /dev/null +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330db"} +{"spark": "332db"} +{"spark": "341db"} +{"spark": "350db143"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution + +import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, RapidsConf, RapidsMeta, SparkPlanMeta} + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode +import org.apache.spark.sql.execution.{SparkPlan, SubqueryBroadcastExec} +import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} +import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode + +abstract class GpuSubqueryBroadcastMeta330DBBase(s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) extends + SparkPlanMeta[SubqueryBroadcastExec](s, conf, p, r) { + protected var broadcastBuilder: () => SparkPlan = _ + + override val childExprs: Seq[BaseExprMeta[_]] = Nil + + override val childPlans: Seq[SparkPlanMeta[SparkPlan]] = Nil + + override def tagPlanForGpu(): Unit = s.child match { + // DPP: For AQE off, in this case, we handle DPP by converting the underlying + // BroadcastExchangeExec to GpuBroadcastExchangeExec. + // This is slightly different from the Apache Spark case, because Spark + // sends the underlying plan into the plugin in advance via the PlanSubqueries rule. + // Here, we have the full non-GPU subquery plan, so we convert the whole + // thing. + case ex @ BroadcastExchangeExec(_, child) => + val exMeta = new GpuBroadcastMeta(ex.copy(child = child), conf, p, r) + exMeta.tagForGpu() + if (exMeta.canThisBeReplaced) { + broadcastBuilder = () => exMeta.convertToGpu() + } else { + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + // DPP: For AQE on, we have an almost completely different scenario then before, + // Databricks uses a BroadcastQueryStageExec and either: + // 1) provide an underlying BroadcastExchangeExec that we will have to convert + // somehow + // 2) might already do the reuse work for us. 
The ReusedExchange is now a + // part of the SubqueryBroadcast, so we send it back here as underlying the + // GpuSubqueryBroadcastExchangeExec + case bqse: BroadcastQueryStageExec => + bqse.plan match { + case ex: BroadcastExchangeExec => + val exMeta = new GpuBroadcastMeta(ex, conf, p, r) + exMeta.tagForGpu() + if (exMeta.canThisBeReplaced) { + broadcastBuilder = () => exMeta.convertToGpu() + } else { + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + case reuse: ReusedExchangeExec => + reuse.child match { + case _: GpuBroadcastExchangeExec => + // A BroadcastExchange has already been replaced, so it can run on the GPU + broadcastBuilder = () => reuse + case _ => + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + } + case _ => + willNotWorkOnGpu("the subquery to broadcast can not entirely run in the GPU.") + } + /** + * Simply returns the original plan. Because its only child, BroadcastExchange, doesn't + * need to change if SubqueryBroadcastExec falls back to the CPU. + */ + override def convertToCpu(): SparkPlan = s + + /** Extract the broadcast mode key expressions if there are any. */ + protected def getBroadcastModeKeyExprs: Option[Seq[Expression]] = { + val broadcastMode = s.child match { + case b: BroadcastExchangeExec => + b.mode + case bqse: BroadcastQueryStageExec => + bqse.plan match { + case b: BroadcastExchangeExec => + b.mode + case reuse: ReusedExchangeExec => + reuse.child match { + case g: GpuBroadcastExchangeExec => + g.mode + } + case _ => + throw new AssertionError("should not reach here") + } + } + + broadcastMode match { + case HashedRelationBroadcastMode(keys, _) => Some(keys) + case IdentityBroadcastMode => None + case m => throw new UnsupportedOperationException(s"Unknown broadcast mode $m") + } + } +} + diff --git a/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala index 2f362531646..10e3fa68b76 100644 --- a/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala +++ b/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -15,7 +15,6 @@ */ /*** spark-rapids-shim-json-lines {"spark": "350db143"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.execution @@ -28,7 +27,7 @@ class GpuSubqueryBroadcastMeta( conf: RapidsConf, p: Option[RapidsMeta[_, _, _]], r: DataFromReplacementRule) extends - GpuSubqueryBroadcastMetaBase(s, conf, p, r) { + GpuSubqueryBroadcastMeta330DBBase(s, conf, p, r) { override def convertToGpu(): GpuExec = { GpuSubqueryBroadcastExec(s.name, s.indices, s.buildKeys, broadcastBuilder())( getBroadcastModeKeyExprs) diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala new file mode 100644 index 00000000000..c16564f523e --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution + +import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta} + +import org.apache.spark.sql.execution.SubqueryBroadcastExec + +class GpuSubqueryBroadcastMeta( + s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) extends + GpuSubqueryBroadcastMetaBase(s, conf, p, r) { + override def convertToGpu(): GpuExec = { + GpuSubqueryBroadcastExec(s.name, s.indices, s.buildKeys, broadcastBuilder())( + getBroadcastModeKeyExprs) + } +} From 6b90b2fffb9035921fab6cd105469645c09a7b4d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 25 Nov 2024 14:55:44 -0800 Subject: [PATCH 04/37] Add support for asynchronous writing for parquet (#11730) * Support async writing for query output Signed-off-by: Jihoon Son * doc change * use a long timeout * fix test failure due to a race * fix flaky test * address comments * fix the config name for hold gpu * Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala Simplify case arm Co-authored-by: Gera Shegalov * address comments * missing doc change * use trampoline --------- Signed-off-by: Jihoon Son Co-authored-by: Gera Shegalov --- .../spark/rapids/ColumnarOutputWriter.scala | 36 +++- .../spark/rapids/GpuParquetFileFormat.scala | 13 +- .../com/nvidia/spark/rapids/Plugin.scala | 3 + .../com/nvidia/spark/rapids/RapidsConf.scala | 35 ++++ .../rapids/io/async/AsyncOutputStream.scala | 186 ++++++++++++++++++ .../rapids/io/async/ThrottlingExecutor.scala | 43 ++++ .../rapids/io/async/TrafficController.scala | 142 +++++++++++++ .../io/async/AsyncOutputStreamSuite.scala | 162 +++++++++++++++ .../io/async/ThrottlingExecutorSuite.scala | 145 ++++++++++++++ .../io/async/TrafficControllerSuite.scala | 101 ++++++++++ 10 files changed, 855 insertions(+), 11 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala index 69157c046b6..df62683d346 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,11 +25,13 @@ import com.nvidia.spark.Retryable import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRestoreOnRetry, withRetry, withRetryNoSplit} +import com.nvidia.spark.rapids.io.async.{AsyncOutputStream, TrafficController} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FSDataOutputStream, Path} +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.TaskContext +import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.{ColumnarWriteTaskStatsTracker, GpuWriteTaskStatsTracker} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -70,21 +72,31 @@ abstract class ColumnarOutputWriterFactory extends Serializable { abstract class ColumnarOutputWriter(context: TaskAttemptContext, dataSchema: StructType, rangeName: String, - includeRetry: Boolean) extends HostBufferConsumer { + includeRetry: Boolean, + holdGpuBetweenBatches: Boolean = false) extends HostBufferConsumer with Logging { protected val tableWriter: TableWriter protected val conf: Configuration = context.getConfiguration - // This is implemented as a method to make it easier to subclass - // ColumnarOutputWriter in the tests, and override this behavior. - protected def getOutputStream: FSDataOutputStream = { + private val trafficController: Option[TrafficController] = TrafficController.getInstance + + private def openOutputStream(): OutputStream = { val hadoopPath = new Path(path) val fs = hadoopPath.getFileSystem(conf) fs.create(hadoopPath, false) } - protected val outputStream: FSDataOutputStream = getOutputStream + // This is implemented as a method to make it easier to subclass + // ColumnarOutputWriter in the tests, and override this behavior. + protected def getOutputStream: OutputStream = { + trafficController.map(controller => { + logWarning("Async output write enabled") + new AsyncOutputStream(() => openOutputStream(), controller) + }).getOrElse(openOutputStream()) + } + + protected val outputStream: OutputStream = getOutputStream private[this] val tempBuffer = new Array[Byte](128 * 1024) private[this] var anythingWritten = false @@ -166,7 +178,11 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext, } // we successfully buffered to host memory, release the semaphore and write // the buffered data to the FS - GpuSemaphore.releaseIfNecessary(TaskContext.get) + if (!holdGpuBetweenBatches) { + logDebug("Releasing semaphore between batches") + GpuSemaphore.releaseIfNecessary(TaskContext.get) + } + writeBufferedData() updateStatistics(writeStartTime, gpuTime, statsTrackers) spillableBatch.numRows() @@ -202,6 +218,10 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext, // buffer an empty batch on close() to work around issues in cuDF // where corrupt files can be written if nothing is encoded via the writer. anythingWritten = true + + // tableWriter.write() serializes the table into the HostMemoryBuffer, and buffers it + // by calling handleBuffer() on the ColumnarOutputWriter. It may not write to the + // output stream just yet. 
tableWriter.write(table) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala index 25105386b3d..2b5f246e56a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala @@ -271,13 +271,19 @@ class GpuParquetFileFormat extends ColumnarFileFormat with Logging { s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.") } + val asyncOutputWriteEnabled = RapidsConf.ENABLE_ASYNC_OUTPUT_WRITE.get(sqlConf) + // holdGpuBetweenBatches is on by default if asyncOutputWriteEnabled is on + val holdGpuBetweenBatches = RapidsConf.ASYNC_QUERY_OUTPUT_WRITE_HOLD_GPU_IN_TASK.get(sqlConf) + .getOrElse(asyncOutputWriteEnabled) + new ColumnarOutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): ColumnarOutputWriter = { new GpuParquetWriter(path, dataSchema, compressionType, outputTimestampType.toString, - dateTimeRebaseMode, timestampRebaseMode, context, parquetFieldIdWriteEnabled) + dateTimeRebaseMode, timestampRebaseMode, context, parquetFieldIdWriteEnabled, + holdGpuBetweenBatches) } override def getFileExtension(context: TaskAttemptContext): String = { @@ -299,8 +305,9 @@ class GpuParquetWriter( dateRebaseMode: DateTimeRebaseMode, timestampRebaseMode: DateTimeRebaseMode, context: TaskAttemptContext, - parquetFieldIdEnabled: Boolean) - extends ColumnarOutputWriter(context, dataSchema, "Parquet", true) { + parquetFieldIdEnabled: Boolean, + holdGpuBetweenBatches: Boolean) + extends ColumnarOutputWriter(context, dataSchema, "Parquet", true, holdGpuBetweenBatches) { override def throwIfRebaseNeededInExceptionMode(batch: ColumnarBatch): Unit = { val cols = GpuColumnVector.extractBases(batch) cols.foreach { col => diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index e20b21da520..5127c7899a8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -31,6 +31,7 @@ import com.nvidia.spark.DFUDFPlugin import com.nvidia.spark.rapids.RapidsConf.AllowMultipleJars import com.nvidia.spark.rapids.RapidsPluginUtils.buildInfoEvent import com.nvidia.spark.rapids.filecache.{FileCache, FileCacheLocalityManager, FileCacheLocalityMsg} +import com.nvidia.spark.rapids.io.async.TrafficController import com.nvidia.spark.rapids.jni.GpuTimeZoneDB import com.nvidia.spark.rapids.python.PythonWorkerSemaphore import org.apache.commons.lang3.exception.ExceptionUtils @@ -554,6 +555,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { extraExecutorPlugins.foreach(_.init(pluginContext, extraConf)) GpuSemaphore.initialize() FileCache.init(pluginContext) + TrafficController.initialize(conf) } catch { // Exceptions in executor plugin can cause a single thread to die but the executor process // sticks around without any useful info until it hearbeat times out. 
Print what happened @@ -656,6 +658,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { extraExecutorPlugins.foreach(_.shutdown()) FileCache.shutdown() GpuCoreDumpHandler.shutdown() + TrafficController.shutdown() } override def onTaskFailed(failureReason: TaskFailedReason): Unit = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e22b8f53497..ab7a788d205 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2406,6 +2406,36 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. .booleanConf .createWithDefault(false) + val ENABLE_ASYNC_OUTPUT_WRITE = + conf("spark.rapids.sql.asyncWrite.queryOutput.enabled") + .doc("Option to turn on the async query output write. During the final output write, the " + + "task first copies the output to the host memory, and then writes it into the storage. " + + "When this option is enabled, the task will asynchronously write the output in the host " + + "memory to the storage. Only the Parquet format is supported currently.") + .internal() + .booleanConf + .createWithDefault(false) + + val ASYNC_QUERY_OUTPUT_WRITE_HOLD_GPU_IN_TASK = + conf("spark.rapids.sql.queryOutput.holdGpuInTask") + .doc("Option to hold GPU semaphore between batch processing during the final output write. " + + "This option could degrade query performance if it is enabled without the async query " + + "output write. It is recommended to consider enabling this option only when " + + s"${ENABLE_ASYNC_OUTPUT_WRITE.key} is set. This option is off by default when the async " + + "query output write is disabled; otherwise, it is on.") + .internal() + .booleanConf + .createOptional + + val ASYNC_WRITE_MAX_IN_FLIGHT_HOST_MEMORY_BYTES = + conf("spark.rapids.sql.asyncWrite.maxInFlightHostMemoryBytes") + .doc("Maximum number of host memory bytes per executor that can be in-flight for async " + + "query output write. Tasks may be blocked if the total host memory bytes in-flight " + + "exceeds this value.") + .internal() + .bytesConf(ByteUnit.BYTE) + .createWithDefault(2L * 1024 * 1024 * 1024) + private def printSectionHeader(category: String): Unit = println(s"\n### $category") @@ -2663,6 +2693,9 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isFoldableNonLitAllowed: Boolean = get(FOLDABLE_NON_LIT_ALLOWED) + lazy val asyncWriteMaxInFlightHostMemoryBytes: Long = + get(ASYNC_WRITE_MAX_IN_FLIGHT_HOST_MEMORY_BYTES) + /** * Convert a string value to the injection configuration OomInjection. * @@ -3248,6 +3281,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val caseWhenFuseEnabled: Boolean = get(CASE_WHEN_FUSE) + lazy val isAsyncOutputWriteEnabled: Boolean = get(ENABLE_ASYNC_OUTPUT_WRITE) + private val optimizerDefaults = Map( // this is not accurate because CPU projections do have a cost due to appending values // to each row that is produced, but this needs to be a really small number because diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala new file mode 100644 index 00000000000..40904a96dd2 --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.io.{IOException, OutputStream} +import java.util.concurrent.{Callable, TimeUnit} +import java.util.concurrent.atomic.{AtomicLong, AtomicReference} + +import com.nvidia.spark.rapids.RapidsPluginImplicits._ + +import org.apache.spark.sql.rapids.execution.TrampolineUtil + +/** + * OutputStream that performs writes asynchronously. Writes are scheduled on a background thread + * and executed in the order they were scheduled. This class is not thread-safe and should only be + * used by a single thread. + */ +class AsyncOutputStream(openFn: Callable[OutputStream], trafficController: TrafficController) + extends OutputStream { + + private var closed = false + + private val executor = new ThrottlingExecutor( + TrampolineUtil.newDaemonCachedThreadPool("AsyncOutputStream", 1, 1), + trafficController) + + // Open the underlying stream asynchronously as soon as the AsyncOutputStream is constructed, + // so that the open can be done in parallel with other operations. This could help with + // performance if the open is slow. + private val openFuture = executor.submit(openFn, 0) + // Let's give it enough time to open the stream. Something bad should have happened if it + // takes more than 5 minutes to open a stream. + private val openTimeoutMin = 5 + + private lazy val delegate: OutputStream = { + openFuture.get(openTimeoutMin, TimeUnit.MINUTES) + } + + class Metrics { + var numBytesScheduled: Long = 0 + // This is thread-safe as it is updated by the background thread and can be read by + // any threads. + val numBytesWritten: AtomicLong = new AtomicLong(0) + } + + val metrics = new Metrics + + /** + * The last error that occurred in the background thread, or None if no error occurred. + * Once it is set, all subsequent writes that are already scheduled will fail and no new + * writes will be accepted. + * + * This is thread-safe as it is set by the background thread and can be read by any threads. 
+ */ + val lastError: AtomicReference[Option[Throwable]] = + new AtomicReference[Option[Throwable]](None) + + @throws[IOException] + private def throwIfError(): Unit = { + lastError.get() match { + case Some(t: IOException) => throw t + case Some(t) => throw new IOException(t) + case None => + } + } + + @throws[IOException] + private def ensureOpen(): Unit = { + if (closed) { + throw new IOException("Stream closed") + } + } + + private def scheduleWrite(fn: () => Unit, bytesToWrite: Int): Unit = { + throwIfError() + ensureOpen() + + metrics.numBytesScheduled += bytesToWrite + executor.submit(() => { + throwIfError() + ensureOpen() + + try { + fn() + metrics.numBytesWritten.addAndGet(bytesToWrite) + } catch { + case t: Throwable => + // Update the error state + lastError.set(Some(t)) + } + }, bytesToWrite) + } + + override def write(b: Int): Unit = { + scheduleWrite(() => delegate.write(b), 1) + } + + override def write(b: Array[Byte]): Unit = { + scheduleWrite(() => delegate.write(b), b.length) + } + + /** + * Schedules a write of the given bytes to the underlying stream. The write is executed + * asynchronously on a background thread. The method returns immediately, and the write may not + * have completed when the method returns. + * + * If an error has occurred in the background thread and [[lastError]] has been set, this function + * will throw an IOException immediately. + * + * If an error has occurred in the background thread while executing a previous write after the + * current write has been scheduled, the current write will fail with the same error. + */ + @throws[IOException] + override def write(b: Array[Byte], off: Int, len: Int): Unit = { + scheduleWrite(() => delegate.write(b, off, len), len) + } + + /** + * Flushes all pending writes to the underlying stream. This method blocks until all pending + * writes have been completed. If an error has occurred in the background thread, this method + * will throw an IOException. + * + * If an error has occurred in the background thread and [[lastError]] has been set, this function + * will throw an IOException immediately. + * + * If an error has occurred in the background thread while executing a previous task after the + * current flush has been scheduled, the current flush will fail with the same error. + */ + @throws[IOException] + override def flush(): Unit = { + throwIfError() + ensureOpen() + + val f = executor.submit(() => { + throwIfError() + ensureOpen() + + delegate.flush() + }, 0) + + f.get() + } + + /** + * Closes the underlying stream and releases any resources associated with it. All pending writes + * are flushed before closing the stream. This method blocks until all pending writes have been + * completed. + * + * If an error has occurred while flushing, this function will throw an IOException. + * + * If an error has occurred while executing a previous task before this function is called, + * this function will throw the same error. All resources and the underlying stream are still + * guaranteed to be closed. + */ + @throws[IOException] + override def close(): Unit = { + if (!closed) { + Seq[AutoCloseable]( + () => { + // Wait for all pending writes to complete + // This will throw an exception if one of the writes fails + flush() + }, + () => { + // Give the executor a chance to shutdown gracefully. 
+ executor.shutdownNow(10, TimeUnit.SECONDS) + }, + delegate, + () => closed = true).safeClose() + } + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala new file mode 100644 index 00000000000..45889bf89ac --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{Callable, ExecutorService, Future, TimeUnit} + +/** + * Thin wrapper around an ExecutorService that adds throttling. + */ +class ThrottlingExecutor( + val executor: ExecutorService, throttler: TrafficController) { + + def submit[T](callable: Callable[T], hostMemoryBytes: Long): Future[T] = { + val task = new Task[T](hostMemoryBytes, callable) + throttler.blockUntilRunnable(task) + executor.submit(() => { + try { + task.call() + } finally { + throttler.taskCompleted(task) + } + }) + } + + def shutdownNow(timeout: Long, timeUnit: TimeUnit): Unit = { + executor.shutdownNow() + executor.awaitTermination(timeout, timeUnit) + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala new file mode 100644 index 00000000000..0110f2d89ca --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.Callable +import javax.annotation.concurrent.GuardedBy + +import com.nvidia.spark.rapids.RapidsConf + +/** + * Simple wrapper around a [[Callable]] that also keeps track of the host memory bytes used by + * the task. + * + * Note: we may want to add more metadata to the task in the future, such as the device memory, + * as we implement more throttling strategies. + */ +class Task[T](val hostMemoryBytes: Long, callable: Callable[T]) extends Callable[T] { + override def call(): T = callable.call() +} + +/** + * Throttle interface to be implemented by different throttling strategies. + * + * Currently, only HostMemoryThrottle is implemented, which limits the maximum in-flight host + * memory bytes. 
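The two classes above are designed to compose: a single writer thread calls `write`/`flush`/`close` on `AsyncOutputStream`, while `ThrottlingExecutor` and a `TrafficController` bound how many bytes are queued behind it. A minimal usage sketch, illustration only and not part of the patch: it assumes the classes added above, a throwaway local file path, and that the object lives in the same `io.async` package (as the test suites do) so the protected `TrafficController` constructor is accessible.

```scala
package com.nvidia.spark.rapids.io.async

import java.io.{BufferedOutputStream, FileOutputStream}

object AsyncWriteSketch {
  def main(args: Array[String]): Unit = {
    // Allow roughly 64 MB of queued write data before the writer thread blocks.
    val controller = new TrafficController(new HostMemoryThrottle(64L * 1024 * 1024))

    // The open callable runs on the background thread, so construction returns quickly
    // even if opening the underlying stream is slow.
    val out = new AsyncOutputStream(
      () => new BufferedOutputStream(new FileOutputStream("/tmp/async-write-sketch.bin")),
      controller)

    val chunk = new Array[Byte](1 << 20)
    try {
      // Each call only schedules the write; TrafficController blocks the caller once the
      // in-flight bytes would exceed the throttle limit.
      (0 until 16).foreach(_ => out.write(chunk))
    } finally {
      // close() flushes pending writes and rethrows any background failure as an IOException.
      out.close()
    }
  }
}
```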
In the future, we can add more throttling strategies, such as limiting the + * device memory usage, the number of tasks, etc. + */ +trait Throttle { + + /** + * Returns true if the task can be accepted, false otherwise. + * TrafficController will block the task from being scheduled until this method returns true. + */ + def canAccept[T](task: Task[T]): Boolean + + /** + * Callback to be called when a task is scheduled. + */ + def taskScheduled[T](task: Task[T]): Unit + + /** + * Callback to be called when a task is completed, either successfully or with an exception. + */ + def taskCompleted[T](task: Task[T]): Unit +} + +/** + * Throttle implementation that limits the total host memory used by the in-flight tasks. + */ +class HostMemoryThrottle(val maxInFlightHostMemoryBytes: Long) extends Throttle { + private var totalHostMemoryBytes: Long = 0 + + override def canAccept[T](task: Task[T]): Boolean = { + totalHostMemoryBytes + task.hostMemoryBytes <= maxInFlightHostMemoryBytes + } + + override def taskScheduled[T](task: Task[T]): Unit = { + totalHostMemoryBytes += task.hostMemoryBytes + } + + override def taskCompleted[T](task: Task[T]): Unit = { + totalHostMemoryBytes -= task.hostMemoryBytes + } + + def getTotalHostMemoryBytes: Long = totalHostMemoryBytes +} + +/** + * TrafficController is responsible for blocking tasks from being scheduled when the throttle + * is exceeded. It also keeps track of the number of tasks that are currently scheduled. + * + * This class is thread-safe as it is used by multiple tasks. + */ +class TrafficController protected[rapids] (throttle: Throttle) { + + @GuardedBy("this") + private var numTasks: Int = 0 + + /** + * Blocks the task from being scheduled until the throttle allows it. If there is no task + * currently scheduled, the task is scheduled immediately even if the throttle is exceeded. + */ + def blockUntilRunnable[T](task: Task[T]): Unit = synchronized { + if (numTasks > 0) { + while (!throttle.canAccept(task)) { + wait(100) + } + } + numTasks += 1 + throttle.taskScheduled(task) + } + + def taskCompleted[T](task: Task[T]): Unit = synchronized { + numTasks -= 1 + throttle.taskCompleted(task) + notify() + } + + def numScheduledTasks: Int = synchronized { + numTasks + } +} + +object TrafficController { + + private var instance: TrafficController = _ + + /** + * Initializes the TrafficController singleton instance. + * This is called once per executor. + */ + def initialize(conf: RapidsConf): Unit = synchronized { + if (conf.isAsyncOutputWriteEnabled && instance == null) { + instance = new TrafficController( + new HostMemoryThrottle(conf.asyncWriteMaxInFlightHostMemoryBytes)) + } + } + + def getInstance: Option[TrafficController] = synchronized { + Option(instance) + } + + def shutdown(): Unit = synchronized { + if (instance != null) { + instance = null + } + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala new file mode 100644 index 00000000000..a4fa35349ce --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
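A small illustration of the admission rule in `TrafficController.blockUntilRunnable`, assuming the classes from this patch and the same package placement as above; the commented values follow directly from the code.

```scala
package com.nvidia.spark.rapids.io.async

object ThrottleSketch {
  def main(args: Array[String]): Unit = {
    val throttle = new HostMemoryThrottle(100)
    val controller = new TrafficController(throttle)

    // A task heavier than the limit is still admitted while nothing else is in flight,
    // so a single oversized write cannot deadlock the caller.
    val big = new Task[Unit](150, () => ())
    controller.blockUntilRunnable(big)         // returns immediately
    println(throttle.getTotalHostMemoryBytes)  // 150

    // A second task would block inside blockUntilRunnable here until `big` completes
    // on some other thread and taskCompleted releases its bytes.
    controller.taskCompleted(big)
    println(controller.numScheduledTasks)      // 0
  }
}
```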
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.io.{BufferedOutputStream, File, FileOutputStream, IOException, OutputStream} +import java.util.concurrent.Callable + +import com.nvidia.spark.rapids.Arm.withResource +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class AsyncOutputStreamSuite extends AnyFunSuite with BeforeAndAfterEach { + + private val bufLen = 128 * 1024 + private val buf: Array[Byte] = new Array[Byte](bufLen) + private val maxBufCount = 10 + private val trafficController = new TrafficController( + new HostMemoryThrottle(bufLen * maxBufCount)) + + def openStream(): AsyncOutputStream = { + new AsyncOutputStream(() => { + val file = File.createTempFile("async-write-test", "tmp") + new BufferedOutputStream(new FileOutputStream(file)) + }, trafficController) + } + + test("open, write, and close") { + val numBufs = 1000 + val stream = openStream() + withResource(stream) { os => + for (_ <- 0 until numBufs) { + os.write(buf) + } + } + assertResult(bufLen * numBufs)(stream.metrics.numBytesScheduled) + assertResult(bufLen * numBufs)(stream.metrics.numBytesWritten.get()) + } + + test("write after closed") { + val os = openStream() + os.close() + assertThrows[IOException] { + os.write(buf) + } + } + + test("flush after closed") { + val os = openStream() + os.close() + assertThrows[IOException] { + os.flush() + } + } + + class ThrowingOutputStream extends OutputStream { + + var failureCount = 0 + + override def write(i: Int): Unit = { + failureCount += 1 + throw new IOException(s"Failed ${failureCount} times") + } + + override def write(b: Array[Byte], off: Int, len: Int): Unit = { + failureCount += 1 + throw new IOException(s"Failed ${failureCount} times") + } + } + + def assertThrowsWithMsg[T](fn: Callable[T], clue: String, + expectedMsgPrefix: String): Unit = { + withClue(clue) { + try { + fn.call() + } catch { + case t: Throwable => + assertIOExceptionMsg(t, expectedMsgPrefix) + } + } + } + + def assertIOExceptionMsg(t: Throwable, expectedMsgPrefix: String): Unit = { + if (t.getClass.isAssignableFrom(classOf[IOException])) { + if (!t.getMessage.contains(expectedMsgPrefix)) { + fail(s"Unexpected exception message: ${t.getMessage}") + } + } else { + if (t.getCause != null) { + assertIOExceptionMsg(t.getCause, expectedMsgPrefix) + } else { + fail(s"Unexpected exception: $t") + } + } + } + + test("write after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + // The first call to `write` should succeed + os.write(buf) + + // Wait for the first write to fail + while (os.lastError.get().isEmpty) { + Thread.sleep(100) + } + + // The second `write` call should fail with the exception thrown by the first write failure + assertThrowsWithMsg(() => os.write(buf), + "The second write should fail with the exception thrown by the first write failure", + "Failed 1 times") + + // `close` throws the same exception + assertThrowsWithMsg(() => os.close(), + "The second write should fail with the exception thrown by the first write failure", + "Failed 1 times") + + 
assertResult(bufLen)(os.metrics.numBytesScheduled) + assertResult(0)(os.metrics.numBytesWritten.get()) + assert(os.lastError.get().get.isInstanceOf[IOException]) + } + + test("flush after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + // The first write should succeed + os.write(buf) + + // The flush should fail with the exception thrown by the write failure + assertThrowsWithMsg(() => os.flush(), + "The flush should fail with the exception thrown by the write failure", + "Failed 1 times") + + // `close` throws the same exception + assertThrowsWithMsg(() => os.close(), + "The flush should fail with the exception thrown by the write failure", + "Failed 1 times") + } + + test("close after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + os.write(buf) + + assertThrowsWithMsg(() => os.close(), + "Close should fail with the exception thrown by the write failure", + "Failed 1 times") + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala new file mode 100644 index 00000000000..a8acf240878 --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{Callable, CountDownLatch, ExecutionException, Executors, Future, RejectedExecutionException, TimeUnit} + +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class ThrottlingExecutorSuite extends AnyFunSuite with BeforeAndAfterEach { + + // Some tests might take longer than usual in the limited CI environment. + // Use a long timeout to avoid flakiness. 
+ val longTimeoutSec = 5 + + var throttle: HostMemoryThrottle = _ + var trafficController: TrafficController = _ + var executor: ThrottlingExecutor = _ + + class TestTask extends Callable[Unit] { + val latch = new CountDownLatch(1) + override def call(): Unit = { + latch.await() + } + } + + override def beforeEach(): Unit = { + throttle = new HostMemoryThrottle(100) + trafficController = new TrafficController(throttle) + executor = new ThrottlingExecutor( + Executors.newSingleThreadExecutor(), + trafficController + ) + } + + override def afterEach(): Unit = { + executor.shutdownNow(longTimeoutSec, TimeUnit.SECONDS) + } + + test("tasks submitted should update the state") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val future2 = executor.submit(task2, 20) + assertResult(2)(trafficController.numScheduledTasks) + assertResult(30)(throttle.getTotalHostMemoryBytes) + + task1.latch.countDown() + future1.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(20)(throttle.getTotalHostMemoryBytes) + + task2.latch.countDown() + future2.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("tasks submission fails if total weight exceeds maxWeight") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val task2Weight = 100 + val exec = Executors.newSingleThreadExecutor() + val future2 = exec.submit(new Runnable { + override def run(): Unit = executor.submit(task2, task2Weight) + }) + Thread.sleep(100) + assert(!future2.isDone) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + task1.latch.countDown() + future1.get(longTimeoutSec, TimeUnit.SECONDS) + future2.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(task2Weight)(throttle.getTotalHostMemoryBytes) + } + + test("submit one task heavier than maxWeight") { + val future = executor.submit(() => Thread.sleep(10), throttle.maxInFlightHostMemoryBytes + 1) + future.get(longTimeoutSec, TimeUnit.SECONDS) + assert(future.isDone) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("submit multiple tasks such that total weight does not exceed maxWeight") { + val numTasks = 10 + val taskRunTime = 10 + var future: Future[Unit] = null + for (_ <- 0 to numTasks) { + future = executor.submit(() => Thread.sleep(taskRunTime), 1) + } + // Give enough time for all tasks to complete + future.get(numTasks * taskRunTime * 5, TimeUnit.MILLISECONDS) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("shutdown while a task is blocked") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val task2Weight = 100 + val exec = Executors.newSingleThreadExecutor() + val future2 = exec.submit(new Runnable { + override def run(): Unit = executor.submit(task2, task2Weight) + }) + 
executor.shutdownNow(longTimeoutSec, TimeUnit.SECONDS) + + def assertCause(t: Throwable, cause: Class[_]): Unit = { + assert(t.getCause != null) + assert(cause.isInstance(t.getCause)) + } + + val e1 = intercept[ExecutionException](future1.get()) + assertCause(e1, classOf[InterruptedException]) + val e2 = intercept[ExecutionException](future2.get()) + assertCause(e2, classOf[RejectedExecutionException]) + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala new file mode 100644 index 00000000000..32868ff6055 --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{ExecutionException, Executors, ExecutorService, TimeUnit} + +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class TrafficControllerSuite extends AnyFunSuite with BeforeAndAfterEach { + + private var throttle: HostMemoryThrottle = _ + private var controller: TrafficController = _ + private var executor: ExecutorService = _ + + override def beforeEach(): Unit = { + throttle = new HostMemoryThrottle(100) + controller = new TrafficController(throttle) + executor = Executors.newSingleThreadExecutor() + } + + override def afterEach(): Unit = { + executor.shutdownNow() + executor.awaitTermination(1, TimeUnit.SECONDS) + } + + class TestTask(taskMemoryBytes: Long) extends Task[Unit](taskMemoryBytes, () => {}) {} + + test("schedule tasks without blocking") { + val taskMemoryBytes = 50 + val t1 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t1) + assertResult(1)(controller.numScheduledTasks) + assertResult(taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + + val t2 = new TestTask(50) + controller.blockUntilRunnable(t2) + assertResult(2)(controller.numScheduledTasks) + assertResult(2 * taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + + controller.taskCompleted(t1) + assertResult(1)(controller.numScheduledTasks) + assertResult(taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + } + + test("schedule task with blocking") { + val taskMemoryBytes = 50 + val t1 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t1) + + val t2 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t2) + + val t3 = new TestTask(taskMemoryBytes) + val f = executor.submit(new Runnable { + override def run(): Unit = controller.blockUntilRunnable(t3) + }) + Thread.sleep(100) + assert(!f.isDone) + + controller.taskCompleted(t1) + f.get(1, TimeUnit.SECONDS) + } + + test("shutdown while blocking") { + val t1 = new TestTask(10) + controller.blockUntilRunnable(t1) + + val t2 = new TestTask(110) + + val f = executor.submit(new Runnable { + override def run(): Unit = { + controller.blockUntilRunnable(t2) 
+ } + }) + + executor.shutdownNow() + try { + f.get(1, TimeUnit.SECONDS) + fail("Should be interrupted") + } catch { + case ee: ExecutionException => + assert(ee.getCause.isInstanceOf[InterruptedException]) + case _: Throwable => fail("Should be interrupted") + } + } +} From abc3654ffda125b474441ba2cf3dd43ccdfb2483 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:22:44 +0800 Subject: [PATCH 05/37] remove excluded release shim and TODO (#11756) * remove excluded release shim and TODO Signed-off-by: YanxuanLiu * remove shim from 2.13 properties Signed-off-by: YanxuanLiu * Fix error: 'NoneType' object has no attribute 'split' for excluded_shims Signed-off-by: timl --------- Signed-off-by: YanxuanLiu Signed-off-by: timl Co-authored-by: timl --- build/get_buildvers.py | 2 +- pom.xml | 3 +-- scala2.13/pom.xml | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/build/get_buildvers.py b/build/get_buildvers.py index 5fe864670b5..263003ea99f 100644 --- a/build/get_buildvers.py +++ b/build/get_buildvers.py @@ -34,7 +34,7 @@ def _get_buildvers(buildvers, pom_file, logger=None): else: no_snapshots.append(release) excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) - if excluded_shims is not None: + if excluded_shims is not None and excluded_shims.text: for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: if removed_shim in snapshots: snapshots.remove(removed_shim) diff --git a/pom.xml b/pom.xml index 79a6a765470..7409b849968 100644 --- a/pom.xml +++ b/pom.xml @@ -813,8 +813,7 @@ - - 350db143 + . diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index d1368d81d97..9c00390f6e5 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -813,8 +813,7 @@ - - 350db143 + . 
From f5be35e2f50c6ebf64d7914f34fda36772c87729 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 25 Nov 2024 20:26:45 -0600 Subject: [PATCH 06/37] Fix Kudo batch serializer to only read header in hasNext (#11766) Signed-off-by: Jason Lowe --- .../rapids/GpuColumnarBatchSerializer.scala | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 116b8b97504..44a58370c33 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -27,7 +27,7 @@ import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion -import com.nvidia.spark.rapids.jni.kudo.{KudoSerializer, KudoTable} +import com.nvidia.spark.rapids.jni.kudo.{KudoSerializer, KudoTable, KudoTableHeader} import org.apache.spark.TaskContext import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer, SerializerInstance} @@ -495,47 +495,52 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { - private[this] var nextTable: Option[KudoTable] = None + private[this] var nextHeader: Option[KudoTableHeader] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - nextTable.foreach(_.close()) - nextTable = None dIn.close() } } - private def tryReadNext(): Unit = { + private def tryReadNextHeader(): Unit = { if (!streamClosed) { - withResource(new NvtxRange("Read Kudo Table", NvtxColor.YELLOW)) { _ => - val kudoTable = KudoTable.from(dIn) - if (kudoTable.isPresent) { - nextTable = Some(kudoTable.get()) - } else { + withResource(new NvtxRange("Read Kudo Header", NvtxColor.YELLOW)) { _ => + require(nextHeader.isEmpty) + nextHeader = Option(KudoTableHeader.readFrom(dIn).orElse(null)) + if (nextHeader.isEmpty) { dIn.close() streamClosed = true - nextTable = None } } } } override def hasNext: Boolean = { - nextTable match { - case Some(_) => true - case None => - tryReadNext() - nextTable.isDefined + if (nextHeader.isEmpty) { + tryReadNextHeader() } + nextHeader.isDefined } override def next(): (Int, ColumnarBatch) = { if (hasNext) { - val ret = KudoSerializedTableColumn.from(nextTable.get) - nextTable = None - (0, ret) + val header = nextHeader.get + nextHeader = None + val buffer = if (header.getNumColumns == 0) { + null + } else { + withResource(new NvtxRange("Read Kudo Body", NvtxColor.YELLOW)) { _ => + val buffer = HostMemoryBuffer.allocate(header.getTotalDataLen, false) + closeOnExcept(buffer) { _ => + buffer.copyFromStream(0, dIn, header.getTotalDataLen) + } + buffer + } + } + (0, KudoSerializedTableColumn.from(new KudoTable(header, buffer))) } else { throw new NoSuchElementException("Walked off of the end...") } @@ -547,7 +552,9 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) * @return the length of the data to read, or None if the stream is closed or ended */ override def peekNextBatchSize(): Option[Long] = { - tryReadNext() - nextTable.flatMap(t => Option(t.getBuffer)).map(_.getLength) + if 
(nextHeader.isEmpty) { + tryReadNextHeader() + } + nextHeader.map(_.getTotalDataLen) } -} \ No newline at end of file +} From 2b6ac118112c973a7848cb4fc7a26ab68797fb4b Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Tue, 26 Nov 2024 06:36:21 -0800 Subject: [PATCH 07/37] Avoid using StringBuffer in single-threaded methods. (#11759) Signed-off-by: Gera Shegalov --- .../org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala index 97d271b076f..0dd048967a8 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala @@ -203,7 +203,7 @@ object GetJsonObjectMask { oneToOneMap: Map[Char, Char], digitMap: Map[Char, Char]): String = { if (originStr != null) { - val buf = new StringBuffer(originStr.length) + val buf = new StringBuilder(originStr.length) var idx = 0 while (idx < originStr.length) { val originChar = originStr(idx) From e3dce9ec393d84f68c00da6e5631f67abffe94e0 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 26 Nov 2024 23:11:24 +0800 Subject: [PATCH 08/37] Fix query hang when using rapids multithread shuffle manager with kudo (#11771) * Fix query hang when using kudo and multi thread shuffle manager Signed-off-by: liurenjie1024 * Fix NPE --------- Signed-off-by: liurenjie1024 --- .../rapids/GpuColumnarBatchSerializer.scala | 94 +++++++++++-------- .../RapidsShuffleInternalManagerBase.scala | 1 - 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 44a58370c33..54252253d38 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -319,10 +319,12 @@ object SerializedTableColumn { if (batch.numCols == 1) { val cv = batch.column(0) cv match { - case serializedTableColumn: SerializedTableColumn - if serializedTableColumn.hostBuffer != null => - sum += serializedTableColumn.hostBuffer.getLength + case serializedTableColumn: SerializedTableColumn => + sum += Option(serializedTableColumn.hostBuffer).map(_.getLength).getOrElse(0L) + case kudo: KudoSerializedTableColumn => + sum += Option(kudo.kudoTable.getBuffer).map(_.getLength).getOrElse(0L) case _ => + throw new IllegalStateException(s"Unexpected column type: ${cv.getClass}" ) } } sum @@ -496,65 +498,75 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[KudoTableHeader] = None + private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { + toBeReturned.foreach(_.close()) + toBeReturned = None dIn.close() } } - private def tryReadNextHeader(): Unit = { - if (!streamClosed) { - withResource(new NvtxRange("Read Kudo Header", NvtxColor.YELLOW)) { _ => - require(nextHeader.isEmpty) - nextHeader = Option(KudoTableHeader.readFrom(dIn).orElse(null)) - if (nextHeader.isEmpty) { - dIn.close() - 
streamClosed = true + override def peekNextBatchSize(): Option[Long] = { + if (streamClosed) { + None + } else { + if (nextHeader.isEmpty) { + withResource(new NvtxRange("Read Header", NvtxColor.YELLOW)) { _ => + val header = Option(KudoTableHeader.readFrom(dIn).orElse(null)) + if (header.isDefined) { + nextHeader = header + } else { + dIn.close() + streamClosed = true + nextHeader = None + } } } + nextHeader.map(_.getTotalDataLen) } } - override def hasNext: Boolean = { + private def tryReadNext(): Option[ColumnarBatch] = { if (nextHeader.isEmpty) { - tryReadNextHeader() - } - nextHeader.isDefined - } - - override def next(): (Int, ColumnarBatch) = { - if (hasNext) { - val header = nextHeader.get - nextHeader = None - val buffer = if (header.getNumColumns == 0) { - null - } else { - withResource(new NvtxRange("Read Kudo Body", NvtxColor.YELLOW)) { _ => - val buffer = HostMemoryBuffer.allocate(header.getTotalDataLen, false) - closeOnExcept(buffer) { _ => - buffer.copyFromStream(0, dIn, header.getTotalDataLen) + None + } else { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. + closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => + hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) + val kudoTable = new KudoTable(header, hostBuffer) + Some(KudoSerializedTableColumn.from(kudoTable)) } - buffer + } else { + Some(KudoSerializedTableColumn.from(new KudoTable(header, null))) } } - (0, KudoSerializedTableColumn.from(new KudoTable(header, buffer))) - } else { - throw new NoSuchElementException("Walked off of the end...") } } - /** - * Attempt to read the next header from the stream. 
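The rewrite above keeps `peekNextBatchSize`/`hasNext` cheap by reading only the Kudo header and deferring the body allocation and copy to `next()`. A self-contained sketch of that header-then-body pattern with plain `java.io` types; the 4-byte length prefix is only an illustration, not the real Kudo header layout.

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

object PeekThenReadSketch {
  def main(args: Array[String]): Unit = {
    // Build a tiny "block": a 4-byte length header followed by the body.
    val bos = new ByteArrayOutputStream()
    val out = new DataOutputStream(bos)
    val body = Array.fill[Byte](16)(1)
    out.writeInt(body.length)
    out.write(body)
    out.close()

    val in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray))
    // Peeking the size only needs the header, which is cheap to read.
    val peekedLen = in.readInt()
    println(s"next block is $peekedLen bytes")
    // The body is only materialized when the block is actually consumed.
    val payload = new Array[Byte](peekedLen)
    in.readFully(payload)
    println(payload.length) // 16
  }
}
```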
- * - * @return the length of the data to read, or None if the stream is closed or ended - */ - override def peekNextBatchSize(): Option[Long] = { - if (nextHeader.isEmpty) { - tryReadNextHeader() + override def hasNext: Boolean = { + peekNextBatchSize() + nextHeader.isDefined + } + + override def next(): (Int, ColumnarBatch) = { + if (toBeReturned.isEmpty) { + peekNextBatchSize() + toBeReturned = tryReadNext() + if (nextHeader.isEmpty || toBeReturned.isEmpty) { + throw new NoSuchElementException("Walked off of the end...") + } } - nextHeader.map(_.getTotalDataLen) + val ret = toBeReturned.get + toBeReturned = None + nextHeader = None + (0, ret) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala index a44580c3bf5..05bc76c3fab 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala @@ -21,7 +21,6 @@ import java.util.Optional import java.util.concurrent.{Callable, ConcurrentHashMap, ExecutionException, Executors, Future, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} -import scala.collection import scala.collection.mutable import scala.collection.mutable.ListBuffer From 4fa0a1dee986e05733dbdbf4971c42ad5e0e84ec Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 26 Nov 2024 23:44:45 +0800 Subject: [PATCH 09/37] repartition-based fallback for hash aggregate v3 (#11712) Signed-off-by: Hongbin Ma (Mahone) Signed-off-by: Firestarman Co-authored-by: Firestarman --- .../scala/com/nvidia/spark/rapids/Arm.scala | 16 +- .../rapids/AutoClosableArrayBuffer.scala | 54 ++ .../spark/rapids/GpuAggregateExec.scala | 725 ++++++++++-------- .../com/nvidia/spark/rapids/GpuExec.scala | 6 + ...GpuUnboundedToUnboundedAggWindowExec.scala | 29 +- 5 files changed, 476 insertions(+), 354 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala index 926f770a683..b0cd798c179 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.control.ControlThrowable import com.nvidia.spark.rapids.RapidsPluginImplicits._ @@ -134,6 +134,20 @@ object Arm extends ArmScalaSpecificImpl { } } + /** Executes the provided code block, closing the resources only if an exception occurs */ + def closeOnExcept[T <: AutoCloseable, V](r: ListBuffer[T])(block: ListBuffer[T] => V): V = { + try { + block(r) + } catch { + case t: ControlThrowable => + // Don't close for these cases.. 
+ throw t + case t: Throwable => + r.safeClose(t) + throw t + } + } + /** Executes the provided code block, closing the resources only if an exception occurs */ def closeOnExcept[T <: AutoCloseable, V](r: mutable.Queue[T])(block: mutable.Queue[T] => V): V = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala new file mode 100644 index 00000000000..fb1e10b9c9e --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +/** + * Just a simple wrapper to make working with buffers of AutoClosable things play + * nicely with withResource. + */ +class AutoClosableArrayBuffer[T <: AutoCloseable] extends AutoCloseable { + val data = new ArrayBuffer[T]() + + def append(scb: T): Unit = data.append(scb) + + def last: T = data.last + + def removeLast(): T = data.remove(data.length - 1) + + def foreach[U](f: T => U): Unit = data.foreach(f) + + def map[U](f: T => U): Seq[U] = data.map(f).toSeq + + def toArray[B >: T : ClassTag]: Array[B] = data.toArray + + def size(): Int = data.size + + def clear(): Unit = data.clear() + + def forall(p: T => Boolean): Boolean = data.forall(p) + + def iterator: Iterator[T] = data.iterator + + override def toString: String = s"AutoCloseable(${super.toString})" + + override def close(): Unit = { + data.foreach(_.close()) + data.clear() + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index b5360a62f94..60f6dd68509 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -16,11 +16,9 @@ package com.nvidia.spark.rapids -import java.util - import scala.annotation.tailrec -import scala.collection.JavaConverters.collectionAsScalaIterableConverter import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import ai.rapids.cudf import ai.rapids.cudf.{NvtxColor, NvtxRange} @@ -37,7 +35,7 @@ import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Attribute, AttributeReference, AttributeSeq, AttributeSet, Expression, ExprId, If, NamedExpression, NullsFirst, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Attribute, AttributeReference, AttributeSeq, AttributeSet, Expression, ExprId, If, NamedExpression, SortOrder} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import 
org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, HashPartitioning, Partitioning, UnspecifiedDistribution} @@ -47,11 +45,11 @@ import org.apache.spark.sql.execution.{ExplainUtils, SortExec, SparkPlan} import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.aggregate.{CpuToGpuAggregateBufferConverter, CudfAggregate, GpuAggregateExpression, GpuToCpuAggregateBufferConverter} -import org.apache.spark.sql.rapids.execution.{GpuShuffleMeta, TrampolineUtil} +import org.apache.spark.sql.rapids.execution.{GpuBatchSubPartitioner, GpuShuffleMeta, TrampolineUtil} import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch -object AggregateUtils { +object AggregateUtils extends Logging { private val aggs = List("min", "max", "avg", "sum", "count", "first", "last") @@ -98,8 +96,10 @@ object AggregateUtils { inputTypes: Seq[DataType], outputTypes: Seq[DataType], isReductionOnly: Boolean): Long = { + def typesToSize(types: Seq[DataType]): Long = types.map(GpuBatchUtils.estimateGpuMemory(_, nullable = false, rowCount = 1)).sum + val inputRowSize = typesToSize(inputTypes) val outputRowSize = typesToSize(outputTypes) // The cudf hash table implementation allocates four 32-bit integers per input row. @@ -120,22 +120,198 @@ object AggregateUtils { } // Calculate the max rows that can be processed during computation within the budget - val maxRows = totalBudget / computationBytesPerRow + // Make sure it's not less than 1, otherwise some corner test cases may fail + val maxRows = Math.max(totalBudget / computationBytesPerRow, 1) // Finally compute the input target batching size taking into account the cudf row limits Math.min(inputRowSize * maxRows, Int.MaxValue) } + + /** + * Concatenate batches together and perform a merge aggregation on the result. The input batches + * will be closed as part of this operation. + * + * @param batches batches to concatenate and merge aggregate + * @return lazy spillable batch which has NOT been marked spillable + */ + def concatenateAndMerge( + batches: mutable.ArrayBuffer[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + concatAndMergeHelper: AggHelper): SpillableColumnarBatch = { + // TODO: concatenateAndMerge (and calling code) could output a sequence + // of batches for the partial aggregate case. This would be done in case + // a retry failed a certain number of times. + val concatBatch = withResource(batches) { _ => + val concatSpillable = concatenateBatches(metrics, batches.toSeq) + withResource(concatSpillable) { + _.getColumnarBatch() + } + } + computeAggregateAndClose(metrics, concatBatch, concatAndMergeHelper) + } + + /** + * Try to concat and merge neighbour input batches to reduce the number of output batches. + * For some cases where input is highly aggregate-able, we can merge multiple input batches + * into a single output batch. In such cases we can skip repartition at all. 
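A worked example of the budget arithmetic in `computeTargetBatchSize`, with made-up byte counts, showing why the new floor of one row matters once the estimated per-row working set exceeds the budget.

```scala
object TargetBatchSizeSketch {
  def main(args: Array[String]): Unit = {
    // All numbers are illustrative only.
    val inputRowSize = 64L             // estimated bytes per input row
    val computationBytesPerRow = 1024L // estimated working-set bytes per row while aggregating
    val totalBudget = 512L             // budget smaller than a single row's working set

    // Plain integer division would give 0 rows; the floor of 1 keeps the target positive.
    val maxRows = math.max(totalBudget / computationBytesPerRow, 1)
    val targetSize = math.min(inputRowSize * maxRows, Int.MaxValue)
    println(s"maxRows=$maxRows targetSize=$targetSize") // maxRows=1 targetSize=64
  }
}
```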
+ */ + def streamAggregateNeighours( + aggregatedBatches: CloseableBufferedIterator[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + targetMergeBatchSize: Long, + configuredTargetBatchSize: Long, + helper: AggHelper + ): Iterator[SpillableColumnarBatch] = { + new Iterator[SpillableColumnarBatch] { + + override def hasNext: Boolean = aggregatedBatches.hasNext + + override def next(): SpillableColumnarBatch = { + closeOnExcept(new ArrayBuffer[SpillableColumnarBatch]) { stagingBatches => { + var currentSize = 0L + while (aggregatedBatches.hasNext) { + val nextBatch = aggregatedBatches.head + if (currentSize + nextBatch.sizeInBytes > targetMergeBatchSize) { + if (stagingBatches.size == 1) { + return stagingBatches.head + } else if (stagingBatches.isEmpty) { + aggregatedBatches.next + return nextBatch + } + val merged = concatenateAndMerge(stagingBatches, metrics, helper) + stagingBatches.clear + currentSize = 0L + if (merged.sizeInBytes < configuredTargetBatchSize * 0.5) { + stagingBatches += merged + currentSize += merged.sizeInBytes + } else { + return merged + } + } else { + stagingBatches.append(nextBatch) + currentSize += nextBatch.sizeInBytes + aggregatedBatches.next + } + } + + if (stagingBatches.size == 1) { + return stagingBatches.head + } + concatenateAndMerge(stagingBatches, metrics, helper) + } + } + } + } + } + + /** + * Read the input batches and repartition them into buckets. + */ + def iterateAndRepartition( + aggregatedBatches: Iterator[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + targetMergeBatchSize: Long, + helper: AggHelper, + hashKeys: Seq[GpuExpression], + hashBucketNum: Int, + hashSeed: Int, + batchesByBucket: ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]] + ): Boolean = { + + var repartitionHappened = false + if (hashSeed > 200) { + throw new IllegalStateException("Too many times of repartition, may hit a bug?") + } + + def repartitionAndClose(batch: SpillableColumnarBatch): Unit = { + + // OPTIMIZATION + if (!aggregatedBatches.hasNext && batchesByBucket.forall(_.size() == 0)) { + // If this is the only batch (after merging neighbours) to be repartitioned, + // we can just add it to the first bucket and skip repartitioning. + // This is a common case when total input size can fit into a single batch. 
+ batchesByBucket.head.append(batch) + return + } + + withResource(new NvtxWithMetrics("agg repartition", + NvtxColor.CYAN, metrics.repartitionTime)) { _ => + + withResource(new GpuBatchSubPartitioner( + Seq(batch).map(batch => { + withResource(batch) { _ => + batch.getColumnarBatch() + } + }).iterator, + hashKeys, hashBucketNum, hashSeed, "aggRepartition")) { + partitioner => { + (0 until partitioner.partitionsCount).foreach { id => + closeOnExcept(batchesByBucket) { _ => { + val newBatches = partitioner.releaseBatchesByPartition(id) + newBatches.foreach { newBatch => + if (newBatch.numRows() > 0) { + batchesByBucket(id).append(newBatch) + } else { + newBatch.safeClose() + } + } + } + } + } + } + } + } + repartitionHappened = true + } + + while (aggregatedBatches.hasNext) { + repartitionAndClose(aggregatedBatches.next) + } + + // Deal with the over sized buckets + def needRepartitionAgain(bucket: AutoClosableArrayBuffer[SpillableColumnarBatch]) = { + bucket.map(_.sizeInBytes).sum > targetMergeBatchSize && + bucket.size() != 1 && + !bucket.forall(_.numRows() == 1) // this is for test + } + + if (repartitionHappened && batchesByBucket.exists(needRepartitionAgain)) { + logDebug("Some of the repartition buckets are over sized, trying to split them") + + val newBuckets = batchesByBucket.flatMap(bucket => { + if (needRepartitionAgain(bucket)) { + val nextLayerBuckets = + ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + // Recursively merge and repartition the over sized bucket + repartitionHappened = + iterateAndRepartition( + new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, + helper, hashKeys, hashBucketNum, hashSeed + 7, + nextLayerBuckets) || repartitionHappened + nextLayerBuckets + } else { + ArrayBuffer.apply(bucket) + } + }) + batchesByBucket.clear() + batchesByBucket.appendAll(newBuckets) + } + + repartitionHappened + } } /** Utility class to hold all of the metrics related to hash aggregation */ case class GpuHashAggregateMetrics( numOutputRows: GpuMetric, numOutputBatches: GpuMetric, - numTasksFallBacked: GpuMetric, + numTasksRepartitioned: GpuMetric, + numTasksSkippedAgg: GpuMetric, opTime: GpuMetric, computeAggTime: GpuMetric, concatTime: GpuMetric, sortTime: GpuMetric, + repartitionTime: GpuMetric, numAggOps: GpuMetric, numPreSplits: GpuMetric, singlePassTasks: GpuMetric, @@ -208,7 +384,7 @@ class AggHelper( private val groupingAttributes = groupingExpressions.map(_.toAttribute) private val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) // `GpuAggregateFunction` can add a pre and post step for update // and merge aggregates. 
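A toy model of `iterateAndRepartition` on plain integers instead of spillable batches: hash the keys into a fixed number of buckets and re-split any bucket that is still too large using a shifted seed, mirroring the `hashSeed + 7` recursion above (the real code additionally gives up once the seed grows past 200).

```scala
import scala.collection.mutable.ArrayBuffer
import scala.util.hashing.MurmurHash3

object BucketSplitSketch {
  val bucketNum = 16

  // Buckets holding a single distinct key are left alone, since re-hashing them
  // with any seed would put all of their rows back into one bucket.
  def split(keys: Seq[Int], maxPerBucket: Int, seed: Int): Seq[Seq[Int]] = {
    val buckets = ArrayBuffer.fill(bucketNum)(ArrayBuffer.empty[Int])
    keys.foreach { k =>
      val h = MurmurHash3.stringHash(k.toString, seed)
      buckets(((h % bucketNum) + bucketNum) % bucketNum) += k
    }
    buckets.toSeq.flatMap { b =>
      if (b.size > maxPerBucket && b.distinct.size > 1) {
        split(b.toSeq, maxPerBucket, seed + 7) // recurse with a shifted seed
      } else {
        Seq(b.toSeq)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val buckets = split(0 until 10000, maxPerBucket = 200, seed = 107)
    println(buckets.map(_.size).max) // every bucket ends up at or below the cap
  }
}
```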
@@ -228,7 +404,7 @@ class AggHelper( postStep ++= groupingAttributes postStepAttr ++= groupingAttributes postStepDataTypes ++= - groupingExpressions.map(_.dataType) + groupingExpressions.map(_.dataType) private var ix = groupingAttributes.length for (aggExp <- aggregateExpressions) { @@ -380,9 +556,9 @@ class AggHelper( withResource(new NvtxRange("groupby", NvtxColor.BLUE)) { _ => withResource(GpuColumnVector.from(preProcessed)) { preProcessedTbl => val groupOptions = cudf.GroupByOptions.builder() - .withIgnoreNullKeys(false) - .withKeysSorted(doSortAgg) - .build() + .withIgnoreNullKeys(false) + .withKeysSorted(doSortAgg) + .build() val cudfAggsOnColumn = cudfAggregates.zip(aggOrdinals).map { case (cudfAgg, ord) => cudfAgg.groupByAggregate.onColumn(ord) @@ -390,8 +566,8 @@ class AggHelper( // perform the aggregate val aggTbl = preProcessedTbl - .groupBy(groupOptions, groupingOrdinals: _*) - .aggregate(cudfAggsOnColumn.toSeq: _*) + .groupBy(groupOptions, groupingOrdinals: _*) + .aggregate(cudfAggsOnColumn.toSeq: _*) withResource(aggTbl) { _ => GpuColumnVector.from(aggTbl, postStepDataTypes.toArray) @@ -555,8 +731,8 @@ object GpuAggFirstPassIterator { metrics: GpuHashAggregateMetrics ): Iterator[SpillableColumnarBatch] = { val preprocessProjectIter = cbIter.map { cb => - val sb = SpillableColumnarBatch (cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY) - aggHelper.preStepBound.projectAndCloseWithRetrySingleBatch (sb) + val sb = SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY) + aggHelper.preStepBound.projectAndCloseWithRetrySingleBatch(sb) } computeAggregateWithoutPreprocessAndClose(metrics, preprocessProjectIter, aggHelper) } @@ -597,18 +773,18 @@ object GpuAggFinalPassIterator { modeInfo: AggregateModeInfo): BoundExpressionsModeAggregates = { val groupingAttributes = groupingExpressions.map(_.toAttribute) val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val boundFinalProjections = if (modeInfo.hasFinalMode || modeInfo.hasCompleteMode) { val finalProjections = groupingAttributes ++ - aggregateExpressions.map(_.aggregateFunction.evaluateExpression) + aggregateExpressions.map(_.aggregateFunction.evaluateExpression) Some(GpuBindReferences.bindGpuReferences(finalProjections, aggBufferAttributes)) } else { None } // allAttributes can be different things, depending on aggregation mode: - // - Partial mode: grouping key + cudf aggregates (e.g. no avg, intead sum::count + // - Partial mode: grouping key + cudf aggregates (e.g. no avg, instead sum::count // - Final mode: grouping key + spark aggregates (e.g. avg) val finalAttributes = groupingAttributes ++ aggregateAttributes @@ -689,17 +865,22 @@ object GpuAggFinalPassIterator { /** * Iterator that takes another columnar batch iterator as input and emits new columnar batches that * are aggregated based on the specified grouping and aggregation expressions. This iterator tries - * to perform a hash-based aggregation but is capable of falling back to a sort-based aggregation - * which can operate on data that is either larger than can be represented by a cudf column or - * larger than can fit in GPU memory. + * to perform a hash-based aggregation but is capable of falling back to a repartition-based + * aggregation which can operate on data that is either larger than can be represented by a cudf + * column or larger than can fit in GPU memory. 
+ * + * In general, GpuMergeAggregateIterator works in this flow: * - * The iterator starts by pulling all batches from the input iterator, performing an initial - * projection and aggregation on each individual batch via `aggregateInputBatches()`. The resulting - * aggregated batches are cached in memory as spillable batches. Once all input batches have been - * aggregated, `tryMergeAggregatedBatches()` is called to attempt a merge of the aggregated batches - * into a single batch. If this is successful then the resulting batch can be returned, otherwise - * `buildSortFallbackIterator` is used to sort the aggregated batches by the grouping keys and - * performs a final merge aggregation pass on the sorted batches. + * (1) The iterator starts by pulling all batches from the input iterator, performing an initial + * projection and aggregation on each individual batch via `GpuAggFirstPassIterator`, we call it + * "First Pass Aggregate". + * (2) Then the batches after first pass agg is sent to "streamAggregateNeighours", where it tries + * to concat & merge the neighbour batches into fewer batches, then "iterateAndRepartition" + * repartition the batch into fixed size buckets. Recursive repartition will be applied on + * over-sized buckets until each bucket * is within the target size. + * We call this phase "Second Pass Aggregate". + * (3) At "Third Pass Aggregate", we take each bucket and perform a final aggregation on all batches + * in the bucket, check "RepartitionAggregateIterator" for details. * * @param firstPassIter iterator that has done a first aggregation pass over the input data. * @param inputAttributes input attributes to identify the input columns from the input batches @@ -710,13 +891,12 @@ object GpuAggFinalPassIterator { * @param modeInfo identifies which aggregation modes are being used * @param metrics metrics that will be updated during aggregation * @param configuredTargetBatchSize user-specified value for the targeted input batch size - * @param useTieredProject user-specified option to enable tiered projections * @param allowNonFullyAggregatedOutput if allowed to skip third pass Agg * @param skipAggPassReductionRatio skip if the ratio of rows after a pass is bigger than this value * @param localInputRowsCount metric to track the number of input rows processed locally */ class GpuMergeAggregateIterator( - firstPassIter: Iterator[SpillableColumnarBatch], + firstPassIter: CloseableBufferedIterator[SpillableColumnarBatch], inputAttributes: Seq[Attribute], groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[GpuAggregateExpression], @@ -728,18 +908,22 @@ class GpuMergeAggregateIterator( conf: SQLConf, allowNonFullyAggregatedOutput: Boolean, skipAggPassReductionRatio: Double, - localInputRowsCount: LocalGpuMetric) - extends Iterator[ColumnarBatch] with AutoCloseable with Logging { + localInputRowsCount: LocalGpuMetric +) + extends Iterator[ColumnarBatch] with AutoCloseable with Logging { private[this] val isReductionOnly = groupingExpressions.isEmpty private[this] val targetMergeBatchSize = computeTargetMergeBatchSize(configuredTargetBatchSize) - private[this] val aggregatedBatches = new util.ArrayDeque[SpillableColumnarBatch] - private[this] var outOfCoreIter: Option[GpuOutOfCoreSortIterator] = None - /** Iterator for fetching aggregated batches either if: - * 1. a sort-based fallback has occurred - * 2. 
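A stripped-down sketch of the three passes described in the comment above, with batches reduced to small in-memory collections so only the control flow remains; the bucket-by-key step stands in for the hash repartition.

```scala
import scala.collection.mutable.ArrayBuffer

object ThreePassSketch {
  def main(args: Array[String]): Unit = {
    // Toy stand-in: each input batch is 100 keys, and aggregation is a count per key.
    val inputBatches: Iterator[Seq[Int]] =
      Iterator.fill(8)(Seq.tabulate(100)(_ % 10))

    // Pass 1: aggregate each input batch on its own (partial counts per key).
    val firstPass: Iterator[Map[Int, Long]] =
      inputBatches.map(_.groupBy(identity).map { case (k, v) => k -> v.size.toLong })

    // Pass 2: repartition the partial results into buckets by key, without merging yet;
    // every occurrence of a key lands in exactly one bucket.
    val bucketNum = 4
    val buckets = ArrayBuffer.fill(bucketNum)(ArrayBuffer.empty[(Int, Long)])
    firstPass.foreach(_.foreach { case (k, c) => buckets(k % bucketNum).append((k, c)) })

    // Pass 3: final merge aggregation one bucket at a time, so no single step has to
    // hold all of the data concatenated at once.
    val merged = buckets.flatMap(_.groupBy(_._1).map { case (k, kv) => k -> kv.map(_._2).sum })
    println(merged.sortBy(_._1).toList) // each key 0..9 counted 80 times
  }
}
```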
skip third pass agg has occurred - **/ - private[this] var fallbackIter: Option[Iterator[ColumnarBatch]] = None + private[this] val defaultHashBucketNum = 16 + private[this] val defaultHashSeed = 107 + private[this] var batchesByBucket = + ArrayBuffer.fill(defaultHashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + + private[this] var firstBatchChecked = false + + private[this] var bucketIter: Option[RepartitionAggregateIterator] = None + + private[this] var realIter: Option[Iterator[ColumnarBatch]] = None /** Whether a batch is pending for a reduction-only aggregation */ private[this] var hasReductionOnlyBatch: Boolean = isReductionOnly @@ -752,286 +936,168 @@ class GpuMergeAggregateIterator( } override def hasNext: Boolean = { - fallbackIter.map(_.hasNext).getOrElse { + realIter.map(_.hasNext).getOrElse { // reductions produce a result even if the input is empty - hasReductionOnlyBatch || !aggregatedBatches.isEmpty || firstPassIter.hasNext + hasReductionOnlyBatch || firstPassIter.hasNext } } override def next(): ColumnarBatch = { - fallbackIter.map(_.next()).getOrElse { - var shouldSkipThirdPassAgg = false - - // aggregate and merge all pending inputs - if (firstPassIter.hasNext) { - // first pass agg - val rowsAfterFirstPassAgg = aggregateInputBatches() - - // by now firstPassIter has been traversed, so localInputRowsCount is finished updating - if (isReductionOnly || - skipAggPassReductionRatio * localInputRowsCount.value >= rowsAfterFirstPassAgg) { - // second pass agg - tryMergeAggregatedBatches() - - val rowsAfterSecondPassAgg = aggregatedBatches.asScala.foldLeft(0L) { - (totalRows, batch) => totalRows + batch.numRows() - } - shouldSkipThirdPassAgg = - rowsAfterSecondPassAgg > skipAggPassReductionRatio * rowsAfterFirstPassAgg - } else { - shouldSkipThirdPassAgg = true - logInfo(s"Rows after first pass aggregation $rowsAfterFirstPassAgg exceeds " + - s"${skipAggPassReductionRatio * 100}% of " + - s"localInputRowsCount ${localInputRowsCount.value}, skip the second pass agg") - } - } + realIter.map(_.next()).getOrElse { - if (aggregatedBatches.size() > 1) { - // Unable to merge to a single output, so must fall back - if (allowNonFullyAggregatedOutput && shouldSkipThirdPassAgg) { - // skip third pass agg, return the aggregated batches directly - logInfo(s"Rows after second pass aggregation exceeds " + - s"${skipAggPassReductionRatio * 100}% of " + - s"rows after first pass, skip the third pass agg") - fallbackIter = Some(new Iterator[ColumnarBatch] { - override def hasNext: Boolean = !aggregatedBatches.isEmpty - - override def next(): ColumnarBatch = { - withResource(aggregatedBatches.pop()) { spillableBatch => - spillableBatch.getColumnarBatch() - } - } - }) - } else { - // fallback to sort agg, this is the third pass agg - fallbackIter = Some(buildSortFallbackIterator()) + // Handle reduction-only aggregation + if (isReductionOnly) { + val batches = ArrayBuffer.apply[SpillableColumnarBatch]() + while (firstPassIter.hasNext) { + batches += firstPassIter.next() } - fallbackIter.get.next() - } else if (aggregatedBatches.isEmpty) { - if (hasReductionOnlyBatch) { + + if (batches.isEmpty || batches.forall(_.numRows() == 0)) { hasReductionOnlyBatch = false - generateEmptyReductionBatch() + return generateEmptyReductionBatch() } else { - throw new NoSuchElementException("batches exhausted") + hasReductionOnlyBatch = false + val concat = AggregateUtils.concatenateAndMerge(batches, metrics, concatAndMergeHelper) + return withResource(concat) { cb => + cb.getColumnarBatch() + 
} } - } else { - // this will be the last batch - hasReductionOnlyBatch = false - withResource(aggregatedBatches.pop()) { spillableBatch => - spillableBatch.getColumnarBatch() + } + + // Handle the case of skipping second and third pass of aggregation + // This only work when spark.rapids.sql.agg.skipAggPassReductionRatio < 1 + if (!firstBatchChecked && firstPassIter.hasNext + && allowNonFullyAggregatedOutput) { + firstBatchChecked = true + + val peek = firstPassIter.head + // It's only based on first batch of first pass agg, so it's an estimate + val firstPassReductionRatioEstimate = 1.0 * peek.numRows() / localInputRowsCount.value + if (firstPassReductionRatioEstimate > skipAggPassReductionRatio) { + logDebug("Skipping second and third pass aggregation due to " + + "too high reduction ratio in first pass: " + + s"$firstPassReductionRatioEstimate") + // if so, skip any aggregation, return the origin batch directly + + realIter = Some(ConcatIterator.apply(firstPassIter, configuredTargetBatchSize)) + metrics.numTasksSkippedAgg += 1 + return realIter.get.next() + } else { + logInfo(s"The reduction ratio in first pass is not high enough to skip " + + s"second and third pass aggregation: peek.numRows: ${peek.numRows()}, " + + s"localInputRowsCount.value: ${localInputRowsCount.value}") } } + firstBatchChecked = true + + val groupingAttributes = groupingExpressions.map(_.toAttribute) + val aggBufferAttributes = groupingAttributes ++ + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + val hashKeys: Seq[GpuExpression] = + GpuBindReferences.bindGpuReferences(groupingAttributes, aggBufferAttributes.toSeq) + + val repartitionHappened = AggregateUtils.iterateAndRepartition( + AggregateUtils.streamAggregateNeighours( + firstPassIter, + metrics, + targetMergeBatchSize, + configuredTargetBatchSize, + concatAndMergeHelper) + , metrics, targetMergeBatchSize, concatAndMergeHelper, + hashKeys, defaultHashBucketNum, defaultHashSeed, batchesByBucket) + if (repartitionHappened) { + metrics.numTasksRepartitioned += 1 + } + + realIter = Some(ConcatIterator.apply( + new CloseableBufferedIterator(buildBucketIterator()), configuredTargetBatchSize)) + realIter.get.next() } } override def close(): Unit = { - aggregatedBatches.forEach(_.safeClose()) - aggregatedBatches.clear() - outOfCoreIter.foreach(_.close()) - outOfCoreIter = None - fallbackIter = None + batchesByBucket.foreach(_.close()) + batchesByBucket.clear() hasReductionOnlyBatch = false } private def computeTargetMergeBatchSize(confTargetSize: Long): Long = { val mergedTypes = groupingExpressions.map(_.dataType) ++ aggregateExpressions.map(_.dataType) - AggregateUtils.computeTargetBatchSize(confTargetSize, mergedTypes, mergedTypes,isReductionOnly) + AggregateUtils.computeTargetBatchSize(confTargetSize, mergedTypes, mergedTypes, isReductionOnly) } - /** Aggregate all input batches and place the results in the aggregatedBatches queue. 
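A numeric illustration of the skip decision in `next()` above; the threshold shown is made up rather than the shipped default, and the shortcut is only taken when the plan can accept partially aggregated output (`allowNonFullyAggregatedOutput`).

```scala
object SkipAggSketch {
  def main(args: Array[String]): Unit = {
    // Made-up numbers: the first pre-aggregated batch kept 9,500 of 10,000 input rows.
    val inputRows = 10000L
    val rowsAfterFirstBatchAgg = 9500L
    val skipAggPassReductionRatio = 0.9 // illustrative threshold, not necessarily the default

    val firstPassReductionRatioEstimate = rowsAfterFirstBatchAgg.toDouble / inputRows
    // The keys barely repeat, so repartition plus a final merge would mostly waste work:
    // emit the partially aggregated batches and let the downstream stage finish the job.
    val skip = firstPassReductionRatioEstimate > skipAggPassReductionRatio
    println(s"estimate=$firstPassReductionRatioEstimate skip=$skip") // estimate=0.95 skip=true
  }
}
```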
*/ - private def aggregateInputBatches(): Long = { - var rowsAfter = 0L - // cache everything in the first pass - while (firstPassIter.hasNext) { - val batch = firstPassIter.next() - rowsAfter += batch.numRows() - aggregatedBatches.add(batch) - } - rowsAfter - } + private lazy val concatAndMergeHelper = + new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, + forceMerge = true, conf, isSorted = false) + + private case class ConcatIterator( + input: CloseableBufferedIterator[SpillableColumnarBatch], + targetSize: Long) + extends Iterator[ColumnarBatch] { + + override def hasNext: Boolean = input.hasNext + + override def next(): ColumnarBatch = { + // combine all the data into a single batch + val spillCbs = ArrayBuffer[SpillableColumnarBatch]() + var totalBytes = 0L + closeOnExcept(spillCbs) { _ => + while (input.hasNext && (spillCbs.isEmpty || + (totalBytes + input.head.sizeInBytes) < targetSize)) { + val tmp = input.next + totalBytes += tmp.sizeInBytes + spillCbs += tmp + } - /** - * Attempt to merge adjacent batches in the aggregatedBatches queue until either there is only - * one batch or merging adjacent batches would exceed the target batch size. - */ - private def tryMergeAggregatedBatches(): Unit = { - while (aggregatedBatches.size() > 1) { - val concatTime = metrics.concatTime - val opTime = metrics.opTime - withResource(new NvtxWithMetrics("agg merge pass", NvtxColor.BLUE, concatTime, - opTime)) { _ => - // continue merging as long as some batches are able to be combined - if (!mergePass()) { - if (aggregatedBatches.size() > 1 && isReductionOnly) { - // We were unable to merge the aggregated batches within the target batch size limit, - // which means normally we would fallback to a sort-based approach. However for - // reduction-only aggregation there are no keys to use for a sort. The only way this - // can work is if all batches are merged. This will exceed the target batch size limit, - // but at this point it is either risk an OOM/cudf error and potentially work or - // not work at all. - logWarning(s"Unable to merge reduction-only aggregated batches within " + - s"target batch limit of $targetMergeBatchSize, attempting to merge remaining " + - s"${aggregatedBatches.size()} batches beyond limit") - withResource(mutable.ArrayBuffer[SpillableColumnarBatch]()) { batchesToConcat => - aggregatedBatches.forEach(b => batchesToConcat += b) - aggregatedBatches.clear() - val batch = concatenateAndMerge(batchesToConcat) - // batch does not need to be marked spillable since it is the last and only batch - // and will be immediately retrieved on the next() call. - aggregatedBatches.add(batch) - } - } - return + val concat = GpuAggregateIterator.concatenateBatches(metrics, spillCbs.toSeq) + withResource(concat) { _ => + concat.getColumnarBatch() } } } } - /** - * Perform a single pass over the aggregated batches attempting to merge adjacent batches. - * @return true if at least one merge operation occurred - */ - private def mergePass(): Boolean = { - val batchesToConcat: mutable.ArrayBuffer[SpillableColumnarBatch] = mutable.ArrayBuffer.empty - var wasBatchMerged = false - // Current size in bytes of the batches targeted for the next concatenation - var concatSize: Long = 0L - var batchesLeftInPass = aggregatedBatches.size() - - while (batchesLeftInPass > 0) { - closeOnExcept(batchesToConcat) { _ => - var isConcatSearchFinished = false - // Old batches are picked up at the front of the queue and freshly merged batches are - // appended to the back of the queue. 
Although tempting to allow the pass to "wrap around" - // and pick up batches freshly merged in this pass, it's avoided to prevent changing the - // order of aggregated batches. - while (batchesLeftInPass > 0 && !isConcatSearchFinished) { - val candidate = aggregatedBatches.getFirst - val potentialSize = concatSize + candidate.sizeInBytes - isConcatSearchFinished = concatSize > 0 && potentialSize > targetMergeBatchSize - if (!isConcatSearchFinished) { - batchesLeftInPass -= 1 - batchesToConcat += aggregatedBatches.removeFirst() - concatSize = potentialSize - } - } - } + private case class RepartitionAggregateIterator(opTime: GpuMetric) + extends Iterator[SpillableColumnarBatch] { - val mergedBatch = if (batchesToConcat.length > 1) { - wasBatchMerged = true - concatenateAndMerge(batchesToConcat) - } else { - // Unable to find a neighboring buffer to produce a valid merge in this pass, - // so simply put this buffer back on the queue for other passes. - batchesToConcat.remove(0) - } + batchesByBucket = batchesByBucket.filter(_.size() > 0) - // Add the merged batch to the end of the aggregated batch queue. Only a single pass over - // the batches is being performed due to the batch count check above, so the single-pass - // loop will terminate before picking up this new batch. - aggregatedBatches.addLast(mergedBatch) - batchesToConcat.clear() - concatSize = 0 - } + override def hasNext: Boolean = batchesByBucket.nonEmpty - wasBatchMerged - } + override def next(): SpillableColumnarBatch = { + withResource(new NvtxWithMetrics("RepartitionAggregateIterator.next", + NvtxColor.BLUE, opTime)) { _ => - private lazy val concatAndMergeHelper = - new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, - forceMerge = true, conf = conf) - - /** - * Concatenate batches together and perform a merge aggregation on the result. The input batches - * will be closed as part of this operation. - * @param batches batches to concatenate and merge aggregate - * @return lazy spillable batch which has NOT been marked spillable - */ - private def concatenateAndMerge( - batches: mutable.ArrayBuffer[SpillableColumnarBatch]): SpillableColumnarBatch = { - // TODO: concatenateAndMerge (and calling code) could output a sequence - // of batches for the partial aggregate case. This would be done in case - // a retry failed a certain number of times. - val concatBatch = withResource(batches) { _ => - val concatSpillable = concatenateBatches(metrics, batches.toSeq) - withResource(concatSpillable) { _.getColumnarBatch() } - } - computeAggregateAndClose(metrics, concatBatch, concatAndMergeHelper) - } - - /** Build an iterator that uses a sort-based approach to merge aggregated batches together. 
*/ - private def buildSortFallbackIterator(): Iterator[ColumnarBatch] = { - logInfo(s"Falling back to sort-based aggregation with ${aggregatedBatches.size()} batches") - metrics.numTasksFallBacked += 1 - val aggregatedBatchIter = new Iterator[ColumnarBatch] { - override def hasNext: Boolean = !aggregatedBatches.isEmpty + if (batchesByBucket.last.size() == 1) { + batchesByBucket.remove(batchesByBucket.size - 1).removeLast() + } else { + // put as many buckets as possible together to aggregate, to reduce agg times + closeOnExcept(new ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]]) { + toAggregateBuckets => + var currentSize = 0L + while (batchesByBucket.nonEmpty && + batchesByBucket.last.size() + currentSize < targetMergeBatchSize) { + val bucket = batchesByBucket.remove(batchesByBucket.size - 1) + currentSize += bucket.map(_.sizeInBytes).sum + toAggregateBuckets += bucket + } - override def next(): ColumnarBatch = { - withResource(aggregatedBatches.removeFirst()) { spillable => - spillable.getColumnarBatch() + AggregateUtils.concatenateAndMerge( + toAggregateBuckets.flatMap(_.data), metrics, concatAndMergeHelper) + } } } } + } - if (isReductionOnly) { - // Normally this should never happen because `tryMergeAggregatedBatches` should have done - // a last-ditch effort to concatenate all batches together regardless of target limits. - throw new IllegalStateException("Unable to fallback to sort-based aggregation " + - "without grouping keys") - } - - val groupingAttributes = groupingExpressions.map(_.toAttribute) - val ordering = groupingAttributes.map(SortOrder(_, Ascending, NullsFirst, Seq.empty)) - val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) - val sorter = new GpuSorter(ordering, aggBufferAttributes) - val aggBatchTypes = aggBufferAttributes.map(_.dataType) - // Use the out of core sort iterator to sort the batches by grouping key - outOfCoreIter = Some(GpuOutOfCoreSortIterator( - aggregatedBatchIter, - sorter, - configuredTargetBatchSize, - opTime = metrics.opTime, - sortTime = metrics.sortTime, - outputBatches = NoopMetric, - outputRows = NoopMetric)) - - // The out of core sort iterator does not guarantee that a batch contains all of the values - // for a particular key, so add a key batching iterator to enforce this. That allows each batch - // to be merge-aggregated safely since all values associated with a particular key are - // guaranteed to be in the same batch. - val keyBatchingIter = new GpuKeyBatchingIterator( - outOfCoreIter.get, - sorter, - aggBatchTypes.toArray, - configuredTargetBatchSize, - numInputRows = NoopMetric, - numInputBatches = NoopMetric, - numOutputRows = NoopMetric, - numOutputBatches = NoopMetric, - concatTime = metrics.concatTime, - opTime = metrics.opTime) - - // Finally wrap the key batching iterator with a merge aggregation on the output batches. - new Iterator[ColumnarBatch] { - override def hasNext: Boolean = keyBatchingIter.hasNext - - private val mergeSortedHelper = - new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, - forceMerge = true, conf, isSorted = true) - - override def next(): ColumnarBatch = { - // batches coming out of the sort need to be merged - val resultSpillable = - computeAggregateAndClose(metrics, keyBatchingIter.next(), mergeSortedHelper) - withResource(resultSpillable) { _ => - resultSpillable.getColumnarBatch() - } - } - } + /** Build an iterator merging aggregated batches in each bucket. 
*/ + private def buildBucketIterator(): Iterator[SpillableColumnarBatch] = { + bucketIter = Some(RepartitionAggregateIterator(opTime = metrics.opTime)) + bucketIter.get } + /** * Generates the result of a reduction-only aggregation on empty input by emitting the * initial value of each aggregator. @@ -1117,13 +1183,13 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( ) if (arrayWithStructsGroupings) { willNotWorkOnGpu("ArrayTypes with Struct children in grouping expressions are not " + - "supported") + "supported") } tagForReplaceMode() if (agg.aggregateExpressions.exists(expr => expr.isDistinct) - && agg.aggregateExpressions.exists(expr => expr.filter.isDefined)) { + && agg.aggregateExpressions.exists(expr => expr.filter.isDefined)) { // Distinct with Filter is not supported on the GPU currently, // This makes sure that if we end up here, the plan falls back to the CPU // which will do the right thing. @@ -1195,15 +1261,15 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( // (due to First). Fall back to CPU in this case. if (AggregateUtils.shouldFallbackMultiDistinct(agg.aggregateExpressions)) { willNotWorkOnGpu("Aggregates of non-distinct functions with multiple distinct " + - "functions are non-deterministic for non-distinct functions as it is " + - "computed using First.") + "functions are non-deterministic for non-distinct functions as it is " + + "computed using First.") } } } if (!conf.partialMergeDistinctEnabled && aggPattern.contains(PartialMerge)) { willNotWorkOnGpu("Replacing Partial Merge aggregates disabled. " + - s"Set ${conf.partialMergeDistinctEnabled} to true if desired") + s"Set ${conf.partialMergeDistinctEnabled} to true if desired") } } @@ -1256,11 +1322,11 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( // This is a short term heuristic until we can better understand the cost // of sort vs the cost of doing aggregations so we can better decide. 
lazy val hasSingleBasicGroupingKey = agg.groupingExpressions.length == 1 && - agg.groupingExpressions.headOption.map(_.dataType).exists { - case StringType | BooleanType | ByteType | ShortType | IntegerType | - LongType | _: DecimalType | DateType | TimestampType => true - case _ => false - } + agg.groupingExpressions.headOption.map(_.dataType).exists { + case StringType | BooleanType | ByteType | ShortType | IntegerType | + LongType | _: DecimalType | DateType | TimestampType => true + case _ => false + } val gpuChild = childPlans.head.convertIfNeeded() val gpuAggregateExpressions = @@ -1314,11 +1380,11 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( } val allowSinglePassAgg = (conf.forceSinglePassPartialSortAgg || - (conf.allowSinglePassPartialSortAgg && - hasSingleBasicGroupingKey && - estimatedPreProcessGrowth > 1.1)) && - canUsePartialSortAgg && - groupingCanBeSorted + (conf.allowSinglePassPartialSortAgg && + hasSingleBasicGroupingKey && + estimatedPreProcessGrowth > 1.1)) && + canUsePartialSortAgg && + groupingCanBeSorted GpuHashAggregateExec( aggRequiredChildDistributionExpressions, @@ -1332,7 +1398,8 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( conf.forceSinglePassPartialSortAgg, allowSinglePassAgg, allowNonFullyAggregatedOutput, - conf.skipAggPassReductionRatio) + conf.skipAggPassReductionRatio + ) } } @@ -1351,7 +1418,7 @@ abstract class GpuTypedImperativeSupportedAggregateExecMeta[INPUT <: BaseAggrega private val mayNeedAggBufferConversion: Boolean = agg.aggregateExpressions.exists { expr => expr.aggregateFunction.isInstanceOf[TypedImperativeAggregate[_]] && - (expr.mode == Partial || expr.mode == PartialMerge) + (expr.mode == Partial || expr.mode == PartialMerge) } // overriding data types of Aggregation Buffers if necessary @@ -1420,6 +1487,7 @@ abstract class GpuTypedImperativeSupportedAggregateExecMeta[INPUT <: BaseAggrega allowSinglePassAgg = false, allowNonFullyAggregatedOutput = false, 1) + } else { super.convertToGpu() } @@ -1523,8 +1591,8 @@ object GpuTypedImperativeSupportedAggregateExecMeta { // [A]. there will be a R2C or C2R transition between them // [B]. there exists TypedImperativeAggregate functions in each of them (stages(i).canThisBeReplaced ^ stages(i + 1).canThisBeReplaced) && - containTypedImperativeAggregate(stages(i)) && - containTypedImperativeAggregate(stages(i + 1)) + containTypedImperativeAggregate(stages(i)) && + containTypedImperativeAggregate(stages(i + 1)) } // Return if all internal aggregation buffers are compatible with GPU Overrides. 
@@ -1602,10 +1670,10 @@ object GpuTypedImperativeSupportedAggregateExecMeta { fromCpuToGpu: Boolean): Seq[NamedExpression] = { val converters = mutable.Queue[Either[ - CpuToGpuAggregateBufferConverter, GpuToCpuAggregateBufferConverter]]() + CpuToGpuAggregateBufferConverter, GpuToCpuAggregateBufferConverter]]() mergeAggMeta.childExprs.foreach { case e if e.childExprs.length == 1 && - e.childExprs.head.isInstanceOf[TypedImperativeAggExprMeta[_]] => + e.childExprs.head.isInstanceOf[TypedImperativeAggExprMeta[_]] => e.wrapped.asInstanceOf[AggregateExpression].mode match { case Final | PartialMerge => val typImpAggMeta = e.childExprs.head.asInstanceOf[TypedImperativeAggExprMeta[_]] @@ -1660,16 +1728,16 @@ class GpuHashAggregateMeta( conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuBaseAggregateMeta(agg, agg.requiredChildDistributionExpressions, - conf, parent, rule) + extends GpuBaseAggregateMeta(agg, agg.requiredChildDistributionExpressions, + conf, parent, rule) class GpuSortAggregateExecMeta( override val agg: SortAggregateExec, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuTypedImperativeSupportedAggregateExecMeta(agg, - agg.requiredChildDistributionExpressions, conf, parent, rule) { + extends GpuTypedImperativeSupportedAggregateExecMeta(agg, + agg.requiredChildDistributionExpressions, conf, parent, rule) { override def tagPlanForGpu(): Unit = { super.tagPlanForGpu() @@ -1716,14 +1784,14 @@ class GpuObjectHashAggregateExecMeta( conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuTypedImperativeSupportedAggregateExecMeta(agg, - agg.requiredChildDistributionExpressions, conf, parent, rule) + extends GpuTypedImperativeSupportedAggregateExecMeta(agg, + agg.requiredChildDistributionExpressions, conf, parent, rule) object GpuHashAggregateExecBase { def calcInputAttributes(aggregateExpressions: Seq[GpuAggregateExpression], - childOutput: Seq[Attribute], - inputAggBufferAttributes: Seq[Attribute]): Seq[Attribute] = { + childOutput: Seq[Attribute], + inputAggBufferAttributes: Seq[Attribute]): Seq[Attribute] = { val modes = aggregateExpressions.map(_.mode).distinct if (modes.contains(Final) || modes.contains(PartialMerge)) { // SPARK-31620: when planning aggregates, the partial aggregate uses aggregate function's @@ -1754,7 +1822,7 @@ object GpuHashAggregateExecBase { } /** - * The GPU version of SortAggregateExec that is intended for partial aggregations that are not + * The GPU version of AggregateExec that is intended for partial aggregations that are not * reductions and so it sorts the input data ahead of time to do it in a single pass. * * @param requiredChildDistributionExpressions this is unchanged by the GPU. 
It is used in @@ -1767,7 +1835,6 @@ object GpuHashAggregateExecBase { * node should project) * @param child incoming plan (where we get input columns from) * @param configuredTargetBatchSize user-configured maximum device memory size of a batch - * @param configuredTieredProjectEnabled configurable optimization to use tiered projections * @param allowNonFullyAggregatedOutput whether we can skip the third pass of aggregation * (can omit non fully aggregated data for non-final * stage of aggregation) @@ -1802,11 +1869,13 @@ case class GpuHashAggregateExec( protected override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL protected override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( - NUM_TASKS_FALL_BACKED -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_FALL_BACKED), + NUM_TASKS_REPARTITIONED -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_REPARTITIONED), + NUM_TASKS_SKIPPED_AGG -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_SKIPPED_AGG), OP_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_OP_TIME), AGG_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_AGG_TIME), CONCAT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_CONCAT_TIME), SORT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_SORT_TIME), + REPARTITION_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_REPARTITION_TIME), "NUM_AGGS" -> createMetric(DEBUG_LEVEL, "num agg operations"), "NUM_PRE_SPLITS" -> createMetric(DEBUG_LEVEL, "num pre splits"), "NUM_TASKS_SINGLE_PASS" -> createMetric(MODERATE_LEVEL, "number of single pass tasks"), @@ -1833,11 +1902,13 @@ case class GpuHashAggregateExec( val aggMetrics = GpuHashAggregateMetrics( numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS), numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES), - numTasksFallBacked = gpuLongMetric(NUM_TASKS_FALL_BACKED), + numTasksRepartitioned = gpuLongMetric(NUM_TASKS_REPARTITIONED), + numTasksSkippedAgg = gpuLongMetric(NUM_TASKS_SKIPPED_AGG), opTime = gpuLongMetric(OP_TIME), computeAggTime = gpuLongMetric(AGG_TIME), concatTime = gpuLongMetric(CONCAT_TIME), sortTime = gpuLongMetric(SORT_TIME), + repartitionTime = gpuLongMetric(REPARTITION_TIME), numAggOps = gpuLongMetric("NUM_AGGS"), numPreSplits = gpuLongMetric("NUM_PRE_SPLITS"), singlePassTasks = gpuLongMetric("NUM_TASKS_SINGLE_PASS"), @@ -1867,11 +1938,12 @@ case class GpuHashAggregateExec( val postBoundReferences = GpuAggFinalPassIterator.setupReferences(groupingExprs, aggregateExprs, aggregateAttrs, resultExprs, modeInfo) - new DynamicGpuPartialSortAggregateIterator(cbIter, inputAttrs, groupingExprs, + new DynamicGpuPartialAggregateIterator(cbIter, inputAttrs, groupingExprs, boundGroupExprs, aggregateExprs, aggregateAttrs, resultExprs, modeInfo, localEstimatedPreProcessGrowth, alreadySorted, expectedOrdering, postBoundReferences, targetBatchSize, aggMetrics, conf, - localForcePre, localAllowPre, allowNonFullyAggregatedOutput, skipAggPassReductionRatio) + localForcePre, localAllowPre, allowNonFullyAggregatedOutput, skipAggPassReductionRatio + ) } } @@ -1914,8 +1986,8 @@ case class GpuHashAggregateExec( // Used in de-duping and optimizer rules override def producedAttributes: AttributeSet = AttributeSet(aggregateAttributes) ++ - AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++ - AttributeSet(aggregateBufferAttributes) + AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++ + AttributeSet(aggregateBufferAttributes) // 
AllTuples = distribution with a single partition and all tuples of the dataset are co-located. // Clustered = dataset with tuples co-located in the same partition if they share a specific value @@ -1938,7 +2010,7 @@ case class GpuHashAggregateExec( */ override lazy val allAttributes: AttributeSeq = child.output ++ aggregateBufferAttributes ++ aggregateAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) override def verboseString(maxFields: Int): String = toString(verbose = true, maxFields) @@ -1957,8 +2029,8 @@ case class GpuHashAggregateExec( s"""${loreArgs.mkString(", ")}""" } else { s"$nodeName (keys=$keyString, functions=$functionString)," + - s" filters=${aggregateExpressions.map(_.filter)})" + - s""" ${loreArgs.mkString(", ")}""" + s" filters=${aggregateExpressions.map(_.filter)})" + + s""" ${loreArgs.mkString(", ")}""" } } // @@ -1972,7 +2044,7 @@ case class GpuHashAggregateExec( } } -class DynamicGpuPartialSortAggregateIterator( +class DynamicGpuPartialAggregateIterator( cbIter: Iterator[ColumnarBatch], inputAttrs: Seq[Attribute], groupingExprs: Seq[NamedExpression], @@ -1999,7 +2071,7 @@ class DynamicGpuPartialSortAggregateIterator( // When doing a reduction we don't have the aggIter setup for the very first time // so we have to match what happens for the normal reduction operations. override def hasNext: Boolean = aggIter.map(_.hasNext) - .getOrElse(isReductionOnly || cbIter.hasNext) + .getOrElse(isReductionOnly || cbIter.hasNext) private[this] def estimateCardinality(cb: ColumnarBatch): Int = { withResource(boundGroupExprs.project(cb)) { groupingKeys => @@ -2052,7 +2124,8 @@ class DynamicGpuPartialSortAggregateIterator( inputAttrs.map(_.dataType).toArray, preProcessAggHelper.preStepBound, metrics.opTime, metrics.numPreSplits) - val firstPassIter = GpuAggFirstPassIterator(sortedSplitIter, preProcessAggHelper, metrics) + val firstPassIter = GpuAggFirstPassIterator(sortedSplitIter, preProcessAggHelper, + metrics) // Technically on a partial-agg, which this only works for, this last iterator should // be a noop except for some metrics. 
But for consistency between all of the @@ -2071,6 +2144,7 @@ class DynamicGpuPartialSortAggregateIterator( metrics.opTime, metrics.numPreSplits) val localInputRowsMetrics = new LocalGpuMetric + val firstPassIter = GpuAggFirstPassIterator( splitInputIter.map(cb => { localInputRowsMetrics += cb.numRows() @@ -2080,7 +2154,7 @@ class DynamicGpuPartialSortAggregateIterator( metrics) val mergeIter = new GpuMergeAggregateIterator( - firstPassIter, + new CloseableBufferedIterator(firstPassIter), inputAttrs, groupingExprs, aggregateExprs, @@ -2092,7 +2166,8 @@ class DynamicGpuPartialSortAggregateIterator( conf, allowNonFullyAggregatedOutput, skipAggPassReductionRatio, - localInputRowsMetrics) + localInputRowsMetrics + ) GpuAggFinalPassIterator.makeIter(mergeIter, postBoundReferences, metrics) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala index 0ffead09de6..3d9b6285a91 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala @@ -66,6 +66,7 @@ object GpuMetric extends Logging { val COLLECT_TIME = "collectTime" val CONCAT_TIME = "concatTime" val SORT_TIME = "sortTime" + val REPARTITION_TIME = "repartitionTime" val AGG_TIME = "computeAggTime" val JOIN_TIME = "joinTime" val FILTER_TIME = "filterTime" @@ -73,6 +74,8 @@ object GpuMetric extends Logging { val BUILD_TIME = "buildTime" val STREAM_TIME = "streamTime" val NUM_TASKS_FALL_BACKED = "numTasksFallBacked" + val NUM_TASKS_REPARTITIONED = "numTasksRepartitioned" + val NUM_TASKS_SKIPPED_AGG = "numTasksSkippedAgg" val READ_FS_TIME = "readFsTime" val WRITE_BUFFER_TIME = "writeBufferTime" val FILECACHE_FOOTER_HITS = "filecacheFooterHits" @@ -104,6 +107,7 @@ object GpuMetric extends Logging { val DESCRIPTION_COLLECT_TIME = "collect batch time" val DESCRIPTION_CONCAT_TIME = "concat batch time" val DESCRIPTION_SORT_TIME = "sort time" + val DESCRIPTION_REPARTITION_TIME = "repartition time" val DESCRIPTION_AGG_TIME = "aggregation time" val DESCRIPTION_JOIN_TIME = "join time" val DESCRIPTION_FILTER_TIME = "filter time" @@ -111,6 +115,8 @@ object GpuMetric extends Logging { val DESCRIPTION_BUILD_TIME = "build time" val DESCRIPTION_STREAM_TIME = "stream time" val DESCRIPTION_NUM_TASKS_FALL_BACKED = "number of sort fallback tasks" + val DESCRIPTION_NUM_TASKS_REPARTITIONED = "number of tasks repartitioned for agg" + val DESCRIPTION_NUM_TASKS_SKIPPED_AGG = "number of tasks skipped aggregation" val DESCRIPTION_READ_FS_TIME = "time to read fs data" val DESCRIPTION_WRITE_BUFFER_TIME = "time to write data to buffer" val DESCRIPTION_FILECACHE_FOOTER_HITS = "cached footer hits" diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala index d685efe68e0..7c5b55cd0bd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala @@ -17,10 +17,9 @@ package com.nvidia.spark.rapids.window import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.reflect.ClassTag import ai.rapids.cudf -import com.nvidia.spark.rapids.{ConcatAndConsumeAll, GpuAlias, GpuBindReferences, GpuBoundReference, GpuColumnVector, GpuExpression, GpuLiteral, GpuMetric, GpuProjectExec, 
SpillableColumnarBatch, SpillPriorities} +import com.nvidia.spark.rapids.{AutoClosableArrayBuffer, ConcatAndConsumeAll, GpuAlias, GpuBindReferences, GpuBoundReference, GpuColumnVector, GpuExpression, GpuLiteral, GpuMetric, GpuProjectExec, SpillableColumnarBatch, SpillPriorities} import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRetry, withRetryNoSplit} @@ -36,32 +35,6 @@ import org.apache.spark.sql.rapids.aggregate.{CudfAggregate, GpuAggregateExpress import org.apache.spark.sql.types.{DataType, IntegerType, LongType} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} - -/** - * Just a simple wrapper to make working with buffers of AutoClosable things play - * nicely with withResource. - */ -class AutoClosableArrayBuffer[T <: AutoCloseable]() extends AutoCloseable { - private val data = new ArrayBuffer[T]() - - def append(scb: T): Unit = data.append(scb) - - def last: T = data.last - - def removeLast(): T = data.remove(data.length - 1) - - def foreach[U](f: T => U): Unit = data.foreach(f) - - def toArray[B >: T : ClassTag]: Array[B] = data.toArray - - override def toString: String = s"AutoCloseable(${super.toString})" - - override def close(): Unit = { - data.foreach(_.close()) - data.clear() - } -} - /** * Utilities for conversion between SpillableColumnarBatch, ColumnarBatch, and cudf.Table. */ From 82c26f1de2cbdb13fa0d9e041baa4b738ca85d5e Mon Sep 17 00:00:00 2001 From: knoguchi22 Date: Tue, 26 Nov 2024 13:53:03 -0500 Subject: [PATCH 10/37] Append knoguchi22 to blossom-ci whitelist [skip ci] (#11777) * Append knoguchi to blossom-ci whitelist [skip ci] * Fixing the typo in username. 
Signed-off-by: Koji Noguchi --------- Signed-off-by: Koji Noguchi --- .github/workflows/blossom-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 93557017b08..1d7b0ab8e0b 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -77,7 +77,8 @@ jobs: github.actor == 'Feng-Jiang28' || github.actor == 'SurajAralihalli' || github.actor == 'jihoonson' || - github.actor == 'ustcfy' + github.actor == 'ustcfy' || + github.actor == 'knoguchi22' ) steps: - name: Check if comment is issued by authorized person From ff0ca0f4d52b197a644c332b1ffcbe9c0351fb1f Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 26 Nov 2024 15:07:05 -0600 Subject: [PATCH 11/37] Ability to decompress snappy and zstd Parquet files via CPU [databricks] (#11752) * Ability to decompress Parquet data on CPU Signed-off-by: Jason Lowe * Add tests * Refactor to reduce duplicated code * scala2.13 fix * Address review comments * Fix Databricks build * Update scala2.13 poms --------- Signed-off-by: Jason Lowe --- .../src/main/python/parquet_test.py | 9 +- jenkins/databricks/install_deps.py | 2 + scala2.13/shim-deps/databricks/pom.xml | 6 + shim-deps/databricks/pom.xml | 6 + .../iceberg/parquet/GpuParquetReader.java | 2 + .../spark/source/GpuMultiFileBatchReader.java | 6 +- .../nvidia/spark/rapids/GpuParquetScan.scala | 376 ++++++++++++++++-- .../spark/rapids/HostMemoryStreams.scala | 12 + .../com/nvidia/spark/rapids/RapidsConf.scala | 31 ++ 9 files changed, 406 insertions(+), 44 deletions(-) diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index a223d6559ed..6aa234003ba 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -299,12 +299,19 @@ def test_parquet_read_round_trip_binary_as_string(std_input_path, read_func, rea @pytest.mark.parametrize('compress', parquet_compress_options) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs): +@pytest.mark.parametrize('cpu_decompress', [True, False]) +def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs, cpu_decompress): data_path = spark_tmp_path + '/PARQUET_DATA' with_cpu_session( lambda spark : binary_op_df(spark, long_gen).write.parquet(data_path), conf={'spark.sql.parquet.compression.codec': compress}) all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if cpu_decompress: + all_confs = copy_and_update(all_confs, { + 'spark.rapids.sql.format.parquet.decompressCpu' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.snappy' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.zstd' : 'true' + }) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.read.parquet(data_path), conf=all_confs) diff --git a/jenkins/databricks/install_deps.py b/jenkins/databricks/install_deps.py index 11e2162957e..23453912827 100644 --- a/jenkins/databricks/install_deps.py +++ b/jenkins/databricks/install_deps.py @@ -135,6 +135,8 @@ def define_deps(spark_version, scala_version): f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro-mapred--org.apache.avro__avro-mapred__*.jar'), Artifact('org.apache.avro', 'avro', 
f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro--org.apache.avro__avro__*.jar'), + Artifact('com.github.luben', 'zstd-jni', + f'{prefix_ws_sp_mvn_hadoop}--com.github.luben--zstd-jni--com.github.luben__zstd-jni__*.jar'), ] # Parquet diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml index 9d6ff787ef1..484e2896f61 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -231,6 +231,12 @@ ${spark.version} compile
+ + com.github.luben + zstd-jni + ${spark.version} + compile + org.apache.arrow arrow-format diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index edfa3d6f896..5f36e529aa7 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -231,6 +231,12 @@ ${spark.version} compile + + com.github.luben + zstd-jni + ${spark.version} + compile + org.apache.arrow arrow-format diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java index 47b649af6ed..c61f7c6b6f7 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java @@ -25,6 +25,7 @@ import scala.collection.Seq; +import com.nvidia.spark.rapids.CpuCompressionConfig$; import com.nvidia.spark.rapids.DateTimeRebaseCorrected$; import com.nvidia.spark.rapids.GpuMetric; import com.nvidia.spark.rapids.GpuParquetUtils; @@ -144,6 +145,7 @@ public org.apache.iceberg.io.CloseableIterator iterator() { partReaderSparkSchema, debugDumpPrefix, debugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, DateTimeRebaseCorrected$.MODULE$, // dateRebaseMode DateTimeRebaseCorrected$.MODULE$, // timestampRebaseMode diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java index 9c36fe76020..b32e5e755cb 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java @@ -352,7 +352,8 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, return new MultiFileCloudParquetPartitionReader(conf, pFiles, this::filterParquetBlocks, caseSensitive, parquetDebugDumpPrefix, parquetDebugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, metrics, partitionSchema, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, partitionSchema, numThreads, maxNumFileProcessed, false, // ignoreMissingFiles false, // ignoreCorruptFiles @@ -411,7 +412,7 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, JavaConverters.asJavaCollection(filteredInfo.parquetBlockMeta.blocks()).stream() .map(b -> ParquetSingleDataBlockMeta.apply( filteredInfo.parquetBlockMeta.filePath(), - ParquetDataBlock.apply(b), + ParquetDataBlock.apply(b, CpuCompressionConfig$.MODULE$.disabled()), InternalRow.empty(), ParquetSchemaWrapper.apply(filteredInfo.parquetBlockMeta.schema()), filteredInfo.parquetBlockMeta.readSchema(), @@ -431,6 +432,7 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, caseSensitive, parquetDebugDumpPrefix, parquetDebugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, partitionSchema, numThreads, false, // ignoreMissingFiles false, // ignoreCorruptFiles diff --git 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index e38dab50d72..03eb48de6fb 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids -import java.io.{Closeable, EOFException, FileNotFoundException, IOException, OutputStream} +import java.io.{Closeable, EOFException, FileNotFoundException, InputStream, IOException, OutputStream} import java.net.URI import java.nio.ByteBuffer import java.nio.channels.SeekableByteChannel @@ -31,6 +31,7 @@ import scala.collection.mutable.ArrayBuffer import scala.language.implicitConversions import ai.rapids.cudf._ +import com.github.luben.zstd.ZstdDecompressCtx import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.GpuMetric._ import com.nvidia.spark.rapids.ParquetPartitionReader.{CopyRange, LocalCopy} @@ -47,6 +48,7 @@ import org.apache.parquet.bytes.BytesUtils import org.apache.parquet.bytes.BytesUtils.readIntLittleEndian import org.apache.parquet.column.ColumnDescriptor import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.format.Util import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat} import org.apache.parquet.hadoop.ParquetFileWriter.MAGIC @@ -54,6 +56,7 @@ import org.apache.parquet.hadoop.metadata._ import org.apache.parquet.io.{InputFile, SeekableInputStream} import org.apache.parquet.schema.{DecimalMetadata, GroupType, MessageType, OriginalType, PrimitiveType, Type} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName +import org.xerial.snappy.Snappy import org.apache.spark.TaskContext import org.apache.spark.broadcast.Broadcast @@ -1106,6 +1109,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( }.getOrElse(rapidsConf.getMultithreadedReaderKeepOrder) private val alluxioReplacementTaskTime = AlluxioCfgUtils.enabledAlluxioReplacementAlgoTaskTime(rapidsConf) + private val compressCfg = CpuCompressionConfig.forParquet(rapidsConf) // We can't use the coalescing files reader when InputFileName, InputFileBlockStart, // or InputFileBlockLength because we are combining all the files into a single buffer @@ -1137,7 +1141,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( new MultiFileCloudParquetPartitionReader(conf, files, filterFunc, isCaseSensitive, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, partitionSchema, numThreads, maxNumFileProcessed, ignoreMissingFiles, ignoreCorruptFiles, readUseFieldId, alluxioPathReplacementMap.getOrElse(Map.empty), alluxioReplacementTaskTime, queryUsesInputFile, keepReadsInOrderFromConf, combineConf) @@ -1244,7 +1248,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( clippedBlocks ++= singleFileInfo.blocks.map(block => ParquetSingleDataBlockMeta( singleFileInfo.filePath, - ParquetDataBlock(block), + ParquetDataBlock(block, compressCfg), metaAndFile.file.partitionValues, ParquetSchemaWrapper(singleFileInfo.schema), singleFileInfo.readSchema, @@ -1262,7 +1266,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( new MultiFileParquetPartitionReader(conf, files, 
clippedBlocks.toSeq, isCaseSensitive, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, partitionSchema, numThreads, ignoreMissingFiles, ignoreCorruptFiles, readUseFieldId) } @@ -1307,6 +1311,7 @@ case class GpuParquetPartitionReaderFactory( private val readUseFieldId = ParquetSchemaClipShims.useFieldId(sqlConf) private val footerReadType = GpuParquetScan.footerReaderHeuristic( rapidsConf.parquetReaderFooterType, dataSchema, readDataSchema, readUseFieldId) + private val compressCfg = CpuCompressionConfig.forParquet(rapidsConf) override def supportColumnarReads(partition: InputPartition): Boolean = true @@ -1335,12 +1340,29 @@ case class GpuParquetPartitionReaderFactory( new ParquetPartitionReader(conf, file, singleFileInfo.filePath, singleFileInfo.blocks, singleFileInfo.schema, isCaseSensitive, readDataSchema, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, singleFileInfo.dateRebaseMode, singleFileInfo.timestampRebaseMode, singleFileInfo.hasInt96Timestamps, readUseFieldId) } } +case class CpuCompressionConfig( + decompressSnappyCpu: Boolean, + decompressZstdCpu: Boolean) { + val decompressAnyCpu: Boolean = decompressSnappyCpu || decompressZstdCpu +} + +object CpuCompressionConfig { + def forParquet(conf: RapidsConf): CpuCompressionConfig = { + val cpuEnable = conf.parquetDecompressCpu + CpuCompressionConfig( + decompressSnappyCpu = cpuEnable && conf.parquetDecompressCpuSnappy, + decompressZstdCpu = cpuEnable && conf.parquetDecompressCpuZstd) + } + + def disabled(): CpuCompressionConfig = CpuCompressionConfig(false, false) +} + trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics with MultiFileReaderFunctions { // the size of Parquet magic (at start+end) and footer length values @@ -1353,6 +1375,8 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics def isSchemaCaseSensitive: Boolean + def compressCfg: CpuCompressionConfig + val copyBufferSize = conf.getInt("parquet.read.allocation.size", 8 * 1024 * 1024) def checkIfNeedToSplitBlocks(currentDateRebaseMode: DateTimeRebaseMode, @@ -1418,13 +1442,8 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics schema: MessageType, handleCoalesceFiles: Boolean): Long = { // start with the size of Parquet magic (at start+end) and footer length values - var size: Long = PARQUET_META_SIZE - - // Calculate the total amount of column data that will be copied - // NOTE: Avoid using block.getTotalByteSize here as that is the - // uncompressed size rather than the size in the file. 
- size += currentChunkedBlocks.flatMap(_.getColumns.asScala.map(_.getTotalSize)).sum - + val headerSize: Long = PARQUET_META_SIZE + val blocksSize = ParquetPartitionReader.computeOutputSize(currentChunkedBlocks, compressCfg) val footerSize = calculateParquetFooterSize(currentChunkedBlocks, schema) val extraMemory = if (handleCoalesceFiles) { val numCols = currentChunkedBlocks.head.getColumns().size() @@ -1432,8 +1451,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics } else { 0 } - val totalSize = size + footerSize + extraMemory - totalSize + headerSize + blocksSize + footerSize + extraMemory } protected def writeFooter( @@ -1532,7 +1550,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics * metadata but with the file offsets updated to reflect the new position of the column data * as written to the output. * - * @param in the input stream for the original Parquet file + * @param filePath the path to the Parquet file * @param out the output stream to receive the data * @param blocks block metadata from the original file that will appear in the computed file * @param realStartOffset starting file offset of the first block @@ -1575,6 +1593,258 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics computeBlockMetaData(blocks, realStartOffset) } + private class BufferedFileInput( + filePath: Path, + blocks: Seq[BlockMetaData], + metrics: Map[String, GpuMetric]) extends InputStream { + private[this] val in = filePath.getFileSystem(conf).open(filePath) + private[this] val buffer: Array[Byte] = new Array[Byte](copyBufferSize) + private[this] var bufferSize: Int = 0 + private[this] var bufferFilePos: Long = in.getPos + private[this] var bufferPos: Int = 0 + private[this] val columnIter = blocks.flatMap(_.getColumns.asScala).iterator + private[this] var currentColumn: Option[ColumnChunkMetaData] = None + private[this] val readTime: GpuMetric = metrics.getOrElse(READ_FS_TIME, NoopMetric) + + override def read(): Int = { + while (bufferPos == bufferSize) { + fillBuffer() + } + val result = buffer(bufferPos) + bufferPos += 1 + result + } + + override def read(b: Array[Byte]): Int = read(b, 0, b.length) + + override def read(dest: Array[Byte], off: Int, len: Int): Int = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos) + System.arraycopy(buffer, bufferPos, dest, off + len - bytesLeft, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + len + } + + def read(out: HostMemoryOutputStream, len: Long): Unit = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + // downcast is safe because bufferSize is an int + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos).toInt + out.write(buffer, bufferPos, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + } + + def read(out: HostMemoryBuffer, len: Long): Unit = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + // downcast is safe because bufferSize is an int + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos).toInt + out.setBytes(len - bytesLeft, buffer, bufferPos, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + } + + override def skip(n: Long): Long = { + seek(getPos + n) + n + } + + def getPos: Long = bufferFilePos + bufferPos + + def seek(desiredPos: Long): Unit = { + require(desiredPos >= getPos, "Only supports seeking 
forward") + val posDiff = desiredPos - bufferFilePos + if (posDiff >= 0 && posDiff < bufferSize) { + bufferPos = posDiff.toInt + } else { + in.seek(desiredPos) + bufferFilePos = desiredPos + bufferSize = 0 + bufferPos = 0 + } + } + + override def close(): Unit = { + readTime.ns { + in.close() + } + } + + private def fillBuffer(): Unit = { + // TODO: Add FileCache support https://github.com/NVIDIA/spark-rapids/issues/11775 + var bytesToCopy = currentColumn.map { c => + Math.max(0, c.getStartingPos + c.getTotalSize - getPos) + }.getOrElse(0L) + var done = bytesToCopy >= buffer.length + while (!done && columnIter.hasNext) { + val column = columnIter.next() + currentColumn = Some(column) + done = if (getPos + bytesToCopy == column.getStartingPos) { + bytesToCopy += column.getTotalSize + bytesToCopy >= buffer.length + } else { + true + } + } + if (bytesToCopy <= 0) { + throw new EOFException("read beyond column data range") + } + bufferFilePos = in.getPos + bufferPos = 0 + bufferSize = Math.min(bytesToCopy, buffer.length).toInt + readTime.ns { + in.readFully(buffer, 0, bufferSize) + } + } + } + + /** + * Copies the data corresponding to the clipped blocks in the original file and compute the + * block metadata for the output. The output blocks will contain the same column chunk + * metadata but with the file offsets updated to reflect the new position of the column data + * as written to the output. + * + * @param filePath the path to the Parquet file + * @param out the output stream to receive the data + * @param blocks block metadata from the original file that will appear in the computed file + * @param realStartOffset starting file offset of the first block + * @return updated block metadata corresponding to the output + */ + protected def copyAndUncompressBlocksData( + filePath: Path, + out: HostMemoryOutputStream, + blocks: Seq[BlockMetaData], + realStartOffset: Long, + metrics: Map[String, GpuMetric], + compressCfg: CpuCompressionConfig): Seq[BlockMetaData] = { + val outStartPos = out.getPos + val writeTime = metrics.getOrElse(WRITE_BUFFER_TIME, NoopMetric) + withResource(new BufferedFileInput(filePath, blocks, metrics)) { in => + val newBlocks = blocks.map { block => + val newColumns = block.getColumns.asScala.map { column => + var columnTotalSize = column.getTotalSize + var columnCodec = column.getCodec + val columnStartingPos = realStartOffset + out.getPos - outStartPos + val columnDictOffset = if (column.getDictionaryPageOffset > 0) { + column.getDictionaryPageOffset + columnStartingPos - column.getStartingPos + } else { + 0 + } + writeTime.ns { + columnCodec match { + case CompressionCodecName.SNAPPY if compressCfg.decompressSnappyCpu => + val columnStartPos = out.getPos + decompressSnappy(in, out, column) + columnCodec = CompressionCodecName.UNCOMPRESSED + columnTotalSize = out.getPos - columnStartPos + case CompressionCodecName.ZSTD if compressCfg.decompressZstdCpu => + val columnStartPos = out.getPos + decompressZstd(in, out, column) + columnCodec = CompressionCodecName.UNCOMPRESSED + columnTotalSize = out.getPos - columnStartPos + case _ => + in.seek(column.getStartingPos) + in.read(out, columnTotalSize) + } + } + ColumnChunkMetaData.get( + column.getPath, + column.getPrimitiveType, + columnCodec, + column.getEncodingStats, + column.getEncodings, + column.getStatistics, + columnStartingPos, + columnDictOffset, + column.getValueCount, + columnTotalSize, + columnTotalSize) + } + GpuParquetUtils.newBlockMeta(block.getRowCount, newColumns.toSeq) + } + newBlocks + } + } + + private 
def decompressSnappy( + in: BufferedFileInput, + out: HostMemoryOutputStream, + column: ColumnChunkMetaData): Unit = { + val endPos = column.getStartingPos + column.getTotalSize + in.seek(column.getStartingPos) + var inData: Option[HostMemoryBuffer] = None + try { + while (in.getPos != endPos) { + val pageHeader = Util.readPageHeader(in) + val compressedSize = pageHeader.getCompressed_page_size + val uncompressedSize = pageHeader.getUncompressed_page_size + pageHeader.unsetCrc() + pageHeader.setCompressed_page_size(uncompressedSize) + Util.writePageHeader(pageHeader, out) + if (inData.map(_.getLength).getOrElse(0L) < compressedSize) { + inData.foreach(_.close()) + inData = Some(HostMemoryBuffer.allocate(compressedSize, false)) + } + inData.foreach { compressedBuffer => + in.read(compressedBuffer, compressedSize) + val bbIn = compressedBuffer.asByteBuffer(0, compressedSize) + val bbOut = out.writeAsByteBuffer(uncompressedSize) + Snappy.uncompress(bbIn, bbOut) + } + } + } finally { + inData.foreach(_.close()) + } + } + + private def decompressZstd( + in: BufferedFileInput, + out: HostMemoryOutputStream, + column: ColumnChunkMetaData): Unit = { + val endPos = column.getStartingPos + column.getTotalSize + in.seek(column.getStartingPos) + var inData: Option[HostMemoryBuffer] = None + try { + withResource(new ZstdDecompressCtx()) { ctx => + while (in.getPos != endPos) { + val pageHeader = Util.readPageHeader(in) + val compressedSize = pageHeader.getCompressed_page_size + val uncompressedSize = pageHeader.getUncompressed_page_size + pageHeader.unsetCrc() + pageHeader.setCompressed_page_size(uncompressedSize) + Util.writePageHeader(pageHeader, out) + if (inData.map(_.getLength).getOrElse(0L) < compressedSize) { + inData.foreach(_.close()) + inData = Some(HostMemoryBuffer.allocate(compressedSize, false)) + } + inData.foreach { compressedBuffer => + in.read(compressedBuffer, compressedSize) + val bbIn = compressedBuffer.asByteBuffer(0, compressedSize) + val bbOut = out.writeAsByteBuffer(uncompressedSize) + ctx.decompress(bbOut, bbIn) + } + } + } + } finally { + inData.foreach(_.close()) + } + } + private def copyRemoteBlocksData( remoteCopies: Seq[CopyRange], filePath: Path, @@ -1666,7 +1936,11 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics closeOnExcept(HostMemoryBuffer.allocate(estTotalSize)) { hmb => val out = new HostMemoryOutputStream(hmb) out.write(ParquetPartitionReader.PARQUET_MAGIC) - val outputBlocks = copyBlocksData(filePath, out, blocks, out.getPos, metrics) + val outputBlocks = if (compressCfg.decompressAnyCpu) { + copyAndUncompressBlocksData(filePath, out, blocks, out.getPos, metrics, compressCfg) + } else { + copyBlocksData(filePath, out, blocks, out.getPos, metrics) + } val footerPos = out.getPos writeFooter(out, outputBlocks, clippedSchema) BytesUtils.writeIntLittleEndian(out, (out.getPos - footerPos).toInt) @@ -1802,7 +2076,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics block.asInstanceOf[ParquetDataBlock].dataBlock implicit def toDataBlockBase(blocks: Seq[BlockMetaData]): Seq[DataBlockBase] = - blocks.map(ParquetDataBlock) + blocks.map(b => ParquetDataBlock(b, compressCfg)) implicit def toBlockMetaDataSeq(blocks: Seq[DataBlockBase]): Seq[BlockMetaData] = blocks.map(_.asInstanceOf[ParquetDataBlock].dataBlock) @@ -1814,10 +2088,14 @@ private case class ParquetSchemaWrapper(schema: MessageType) extends SchemaBase } // Parquet BlockMetaData wrapper -private case class ParquetDataBlock(dataBlock: BlockMetaData) extends DataBlockBase 
{ +private case class ParquetDataBlock( + dataBlock: BlockMetaData, + compressCfg: CpuCompressionConfig) extends DataBlockBase { override def getRowCount: Long = dataBlock.getRowCount override def getReadDataSize: Long = dataBlock.getTotalByteSize - override def getBlockSize: Long = dataBlock.getColumns.asScala.map(_.getTotalSize).sum + override def getBlockSize: Long = { + ParquetPartitionReader.computeOutputSize(dataBlock, compressCfg) + } } /** Parquet extra information containing rebase modes and whether there is int96 timestamp */ @@ -1876,6 +2154,7 @@ class MultiFileParquetPartitionReader( maxGpuColumnSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], partitionSchema: StructType, numThreads: Int, @@ -1900,7 +2179,8 @@ class MultiFileParquetPartitionReader( file: Path, outhmb: HostMemoryBuffer, blocks: ArrayBuffer[DataBlockBase], - offset: Long) + offset: Long, + compressCfg: CpuCompressionConfig) extends Callable[(Seq[DataBlockBase], Long)] { override def call(): (Seq[DataBlockBase], Long) = { @@ -1909,7 +2189,11 @@ class MultiFileParquetPartitionReader( val startBytesRead = fileSystemBytesRead() val outputBlocks = withResource(outhmb) { _ => withResource(new HostMemoryOutputStream(outhmb)) { out => - copyBlocksData(file, out, blocks.toSeq, offset, metrics) + if (compressCfg.decompressAnyCpu) { + copyAndUncompressBlocksData(file, out, blocks.toSeq, offset, metrics, compressCfg) + } else { + copyBlocksData(file, out, blocks.toSeq, offset, metrics) + } } } val bytesRead = fileSystemBytesRead() - startBytesRead @@ -1961,7 +2245,7 @@ class MultiFileParquetPartitionReader( blocks: ArrayBuffer[DataBlockBase], offset: Long, batchContext: BatchContext): Callable[(Seq[DataBlockBase], Long)] = { - new ParquetCopyBlocksRunner(taskContext, file, outhmb, blocks, offset) + new ParquetCopyBlocksRunner(taskContext, file, outhmb, blocks, offset, compressCfg) } override final def getFileFormatShortName: String = "Parquet" @@ -2072,6 +2356,7 @@ class MultiFileCloudParquetPartitionReader( maxGpuColumnSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], partitionSchema: StructType, numThreads: Int, @@ -2761,6 +3046,7 @@ class ParquetPartitionReader( targetBatchSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], dateRebaseMode: DateTimeRebaseMode, timestampRebaseMode: DateTimeRebaseMode, @@ -2873,26 +3159,34 @@ object ParquetPartitionReader { length: Long, outputOffset: Long) extends CopyItem - /** - * Build a new BlockMetaData - * - * @param rowCount the number of rows in this block - * @param columns the new column chunks to reference in the new BlockMetaData - * @return the new BlockMetaData - */ - private[rapids] def newParquetBlock( - rowCount: Long, - columns: Seq[ColumnChunkMetaData]): BlockMetaData = { - val block = new BlockMetaData - block.setRowCount(rowCount) + private[rapids] def computeOutputSize( + blocks: Seq[BlockMetaData], + compressCfg: CpuCompressionConfig): Long = { + blocks.map { block => + computeOutputSize(block, compressCfg) + }.sum + } - var totalSize: Long = 0 - columns.foreach { column => - block.addColumn(column) - totalSize += column.getTotalUncompressedSize + private[rapids] 
def computeOutputSize( + block: BlockMetaData, + compressCfg: CpuCompressionConfig): Long = { + if (compressCfg.decompressAnyCpu) { + block.getColumns.asScala.map { c => + if ((c.getCodec == CompressionCodecName.SNAPPY && compressCfg.decompressSnappyCpu) + || (c.getCodec == CompressionCodecName.ZSTD && compressCfg.decompressZstdCpu)) { + // Page headers need to be rewritten when CPU decompresses, and that may + // increase the size of the page header. Guess how many pages there may be + // and add a fudge factor per page to try to avoid a late realloc+copy. + // NOTE: Avoid using block.getTotalByteSize as that is the + // uncompressed size rather than the size in the file. + val estimatedPageCount = (c.getTotalUncompressedSize / (1024 * 1024)) + 1 + c.getTotalUncompressedSize + estimatedPageCount * 8 + } else { + c.getTotalSize + } + }.sum + } else { + block.getColumns.asScala.map(_.getTotalSize).sum } - block.setTotalByteSize(totalSize) - - block } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala index 08fe5be50b2..4be11b13254 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala @@ -54,6 +54,12 @@ class HostMemoryOutputStream(val buffer: HostMemoryBuffer) extends OutputStream pos += numBytes } + def writeAsByteBuffer(length: Int): ByteBuffer = { + val bb = buffer.asByteBuffer(pos, length) + pos += length + bb + } + def getPos: Long = pos def seek(newPos: Long): Unit = { @@ -132,6 +138,12 @@ trait HostMemoryInputStreamMixIn extends InputStream { } } + def readByteBuffer(length: Int): ByteBuffer = { + val bb = hmb.asByteBuffer(pos, length) + pos += length + bb + } + override def skip(count: Long): Long = { val oldPos = pos pos = Math.min(pos + count, hmbLength) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index ab7a788d205..406aeb0365b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1120,6 +1120,31 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .checkValues(RapidsReaderType.values.map(_.toString)) .createWithDefault(RapidsReaderType.AUTO.toString) + val PARQUET_DECOMPRESS_CPU = + conf("spark.rapids.sql.format.parquet.decompressCpu") + .doc("If true then the CPU is eligible to decompress Parquet data rather than the GPU. 
" + + s"See other spark.rapids.sql.format.parquet.decompressCpu.* configuration settings " + + "to control this for specific compression codecs.") + .internal() + .booleanConf + .createWithDefault(false) + + val PARQUET_DECOMPRESS_CPU_SNAPPY = + conf("spark.rapids.sql.format.parquet.decompressCpu.snappy") + .doc(s"If true and $PARQUET_DECOMPRESS_CPU is true then the CPU decompresses " + + "Parquet Snappy data rather than the GPU") + .internal() + .booleanConf + .createWithDefault(true) + + val PARQUET_DECOMPRESS_CPU_ZSTD = + conf("spark.rapids.sql.format.parquet.decompressCpu.zstd") + .doc(s"If true and $PARQUET_DECOMPRESS_CPU is true then the CPU decompresses " + + "Parquet Zstandard data rather than the GPU") + .internal() + .booleanConf + .createWithDefault(true) + val READER_MULTITHREADED_COMBINE_THRESHOLD = conf("spark.rapids.sql.reader.multithreaded.combine.sizeBytes") .doc("The target size in bytes to combine multiple small files together when using the " + @@ -2960,6 +2985,12 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isParquetMultiThreadReadEnabled: Boolean = isParquetAutoReaderEnabled || RapidsReaderType.withName(get(PARQUET_READER_TYPE)) == RapidsReaderType.MULTITHREADED + lazy val parquetDecompressCpu: Boolean = get(PARQUET_DECOMPRESS_CPU) + + lazy val parquetDecompressCpuSnappy: Boolean = get(PARQUET_DECOMPRESS_CPU_SNAPPY) + + lazy val parquetDecompressCpuZstd: Boolean = get(PARQUET_DECOMPRESS_CPU_ZSTD) + lazy val maxNumParquetFilesParallel: Int = get(PARQUET_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL) lazy val isParquetReadEnabled: Boolean = get(ENABLE_PARQUET_READ) From ed02cfe4f54e3c8531017671fd6ad0388128cb75 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 26 Nov 2024 14:02:54 -0800 Subject: [PATCH 12/37] Fix `dpp_test.py` failures on [databricks] 14.3 (#11768) Fixes #11536. This commit fixes the tests in `dpp_test.py` that were failing on Databricks 14.3. The failures were largely a result of an erroneous shim implementation, that was fixed as part of #11750. This commit accounts for the remaining failures that result from there being a `CollectLimitExec` in certain DPP query plans (that include broadcast joins, for example). The tests have been made more permissive, in allowing the `CollectLimitExec` to run on the CPU. The `CollectLimitExec` based plans will be further explored as part of https://github.com/NVIDIA/spark-rapids/issues/11764. Signed-off-by: MithunR --- integration_tests/src/main/python/dpp_test.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/dpp_test.py b/integration_tests/src/main/python/dpp_test.py index b362a4175f3..3d5ee1a5afa 100644 --- a/integration_tests/src/main/python/dpp_test.py +++ b/integration_tests/src/main/python/dpp_test.py @@ -20,7 +20,7 @@ from conftest import spark_tmp_table_factory from data_gen import * from marks import ignore_order, allow_non_gpu, datagen_overrides, disable_ansi_mode -from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later +from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later, is_databricks_version_or_later # non-positive values here can produce a degenerative join, so here we ensure that most values are # positive to ensure the join will produce rows. 
See https://github.com/NVIDIA/spark-rapids/issues/10147 @@ -167,10 +167,17 @@ def fn(spark): ''' ] +# On some Databricks versions (>=14.3), some query plans include a `CollectLimitExec`, +# when filtering partitions. This exec falls back to CPU. These tests allow for `CollectLimit` to +# run on the CPU, if everything else in the plan execute as expected. +# Further details are furnished at https://github.com/NVIDIA/spark-rapids/issues/11764. +dpp_fallback_execs=["CollectLimitExec"] if is_databricks_version_or_later(14,3) else [] + @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 # When BroadcastExchangeExec is available on filtering side, and it can be reused: # DynamicPruningExpression(InSubqueryExec(value, GpuSubqueryBroadcastExec))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10147") @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @@ -245,6 +252,7 @@ def test_dpp_bypass(spark_tmp_table_factory, store_format, s_index, aqe_enabled) # then Spark will plan an extra Aggregate to collect filtering values: # DynamicPruningExpression(InSubqueryExec(value, SubqueryExec(Aggregate(...)))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ @@ -285,10 +293,11 @@ def test_dpp_skip(spark_tmp_table_factory, store_format, s_index, aqe_enabled): non_exist_classes='DynamicPruningExpression', conf=dict(_dpp_fallback_conf + [('spark.sql.adaptive.enabled', aqe_enabled)])) +dpp_like_any_fallback_execs=['FilterExec', 'CollectLimitExec'] if is_databricks_version_or_later(14,3) else ['FilterExec'] # GPU verification on https://issues.apache.org/jira/browse/SPARK-34436 @ignore_order -@allow_non_gpu('FilterExec') +@allow_non_gpu(*dpp_like_any_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ 'false', @@ -327,6 +336,7 @@ def create_dim_table_for_like(spark): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 +@allow_non_gpu(*dpp_fallback_execs) # Test handling DPP expressions from a HashedRelation that rearranges columns @pytest.mark.parametrize('aqe_enabled', [ 'false', From 0bf85cb9e65928328e3a2e0ec3837825f9be5f2d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Nov 2024 08:39:11 +0800 Subject: [PATCH 13/37] Update rapids JNI and private dependency to 25.02.0-SNAPSHOT (#11772) To fix: https://github.com/NVIDIA/spark-rapids/issues/11755\nWait for the pre-merge CI job to SUCCEED Signed-off-by: nvauto <70000568+nvauto@users.noreply.github.com> --- jenkins/databricks/init_cudf_udf.sh | 3 +-- pom.xml | 5 ++--- scala2.13/pom.xml | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 94ca7473143..0898c230d48 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,8 +20,7 @@ set -ex -# TODO: https://github.com/NVIDIA/spark-rapids/issues/11755 -CUDF_VER=${CUDF_VER:-24.12} +CUDF_VER=${CUDF_VER:-25.02} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. 
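As a quick local sanity check of the version bump performed by the `pom.xml` changes below, one can query the same Maven properties that the repository's CI scripts read. This is only an illustrative sketch: it assumes a checkout of this branch with Maven available, and the expected output is inferred from this patch rather than captured from a real run. The property names (`spark-rapids-jni.version`, `spark-rapids-private.version`) mirror the ones used by the repository's own dependency scripts.

```bash
# Illustrative check (assumes a local checkout of this branch and Maven on PATH).
# Property names follow the repo's CI scripts; the expected value noted below is
# an assumption based on this patch, not captured output.
mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout
mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout
# Both are expected to print 25.02.0-SNAPSHOT once this change is applied.
```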
diff --git a/pom.xml b/pom.xml index 7409b849968..c5adf511d97 100644 --- a/pom.xml +++ b/pom.xml @@ -828,9 +828,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.12.0-SNAPSHOT - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT + 25.02.0-SNAPSHOT 2.12 2.8.0 incremental diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 9c00390f6e5..8a078e6e0d0 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -828,9 +828,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.12.0-SNAPSHOT - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT + 25.02.0-SNAPSHOT 2.13 2.8.0 incremental From 5b77ed736d61932147f29f0d7de2d5c7335a903c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 27 Nov 2024 17:41:19 +0800 Subject: [PATCH 14/37] Update advanced configs introduced by private repo (#11785) Signed-off-by: Chong Gao Co-authored-by: Chong Gao --- docs/additional-functionality/advanced_configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index 07346a5b850..a4427d9495a 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -33,6 +33,7 @@ Name | Description | Default Value | Applicable at spark.rapids.filecache.blockPathRegexp|A regular expression to decide which paths will not be cached when the file cache is enabled. If a path is blocked by this regexp but is allowed by spark.rapids.filecache.allowPathRegexp, then the path is blocked.|None|Startup spark.rapids.filecache.checkStale|Controls whether the cached is checked for being out of date with respect to the input file. When enabled, the data that has been cached locally for a file will be invalidated if the file is updated after being cached. This feature is only necessary if an input file for a Spark application can be changed during the lifetime of the application. If an individual input file will not be overwritten during the Spark application then performance may be improved by setting this to false.|true|Startup spark.rapids.filecache.maxBytes|Controls the maximum amount of data that will be cached locally. If left unspecified, it will use half of the available disk space detected on startup for the configured Spark local disks.|None|Startup +spark.rapids.filecache.minimumFreeSpace.bytes|Specify the minimum amount of free space in the Spark local disks. When the amount of free space on the Spark local disks drops below this value, cache data will be removed automatically to free disk space. A zero or negative value will disable this feature. Note if multiple Spark applications running on the same node, or there are other applications running with heavy disk writing, the filecache may not drop caches in time and may cause full disk errors. Please increase this value for this case.|32212254720|Startup spark.rapids.filecache.useChecksums|Whether to write out and verify checksums for the cached local files.|false|Startup spark.rapids.gpu.resourceName|The name of the Spark resource that represents a GPU that you want the plugin to use if using custom resources with Spark.|gpu|Startup spark.rapids.memory.gpu.allocFraction|The fraction of available (free) GPU memory that should be allocated for pooled memory. 
This must be less than or equal to the maximum limit configured via spark.rapids.memory.gpu.maxAllocFraction, and greater than or equal to the minimum limit configured via spark.rapids.memory.gpu.minAllocFraction.|1.0|Startup From ca466e7afa3a98ceaaffffe6b4bdee6b60066b68 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 27 Nov 2024 10:13:06 -0600 Subject: [PATCH 15/37] Remove unnecessary toBeReturned field from serialized batch iterators (#11778) Signed-off-by: Jason Lowe --- .../rapids/GpuColumnarBatchSerializer.scala | 92 +++++++------------ 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 54252253d38..8fde39eecf8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -48,15 +48,13 @@ trait BaseSerializedTableIterator extends Iterator[(Int, ColumnarBatch)] { class SerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[SerializedTableHeader] = None - private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - toBeReturned.foreach(_.close()) - toBeReturned = None dIn.close() + streamClosed = true } } @@ -80,23 +78,20 @@ class SerializedBatchIterator(dIn: DataInputStream) } } - private def tryReadNext(): Option[ColumnarBatch] = { - if (nextHeader.isEmpty) { - None - } else { - withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => - val header = nextHeader.get - if (header.getNumColumns > 0) { - // This buffer will later be concatenated into another host buffer before being - // sent to the GPU, so no need to use pinned memory for these buffers. - closeOnExcept( - HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer => - JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer) - Some(SerializedTableColumn.from(header, hostBuffer)) - } - } else { - Some(SerializedTableColumn.from(header)) + private def readNextBatch(): ColumnarBatch = { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + nextHeader = None + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. 
+ closeOnExcept( + HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer => + JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer) + SerializedTableColumn.from(header, hostBuffer) } + } else { + SerializedTableColumn.from(header) } } } @@ -107,17 +102,10 @@ class SerializedBatchIterator(dIn: DataInputStream) } override def next(): (Int, ColumnarBatch) = { - if (toBeReturned.isEmpty) { - peekNextBatchSize() - toBeReturned = tryReadNext() - if (nextHeader.isEmpty || toBeReturned.isEmpty) { - throw new NoSuchElementException("Walked off of the end...") - } + if (!hasNext) { + throw new NoSuchElementException("Walked off of the end...") } - val ret = toBeReturned.get - toBeReturned = None - nextHeader = None - (0, ret) + (0, readNextBatch()) } } @@ -498,15 +486,13 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[KudoTableHeader] = None - private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - toBeReturned.foreach(_.close()) - toBeReturned = None dIn.close() + streamClosed = true } } @@ -530,23 +516,20 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) } } - private def tryReadNext(): Option[ColumnarBatch] = { - if (nextHeader.isEmpty) { - None - } else { - withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => - val header = nextHeader.get - if (header.getNumColumns > 0) { - // This buffer will later be concatenated into another host buffer before being - // sent to the GPU, so no need to use pinned memory for these buffers. - closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => - hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) - val kudoTable = new KudoTable(header, hostBuffer) - Some(KudoSerializedTableColumn.from(kudoTable)) - } - } else { - Some(KudoSerializedTableColumn.from(new KudoTable(header, null))) + private def readNextBatch(): ColumnarBatch = { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + nextHeader = None + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. 
+ closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => + hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) + val kudoTable = new KudoTable(header, hostBuffer) + KudoSerializedTableColumn.from(kudoTable) } + } else { + KudoSerializedTableColumn.from(new KudoTable(header, null)) } } } @@ -557,16 +540,9 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) } override def next(): (Int, ColumnarBatch) = { - if (toBeReturned.isEmpty) { - peekNextBatchSize() - toBeReturned = tryReadNext() - if (nextHeader.isEmpty || toBeReturned.isEmpty) { - throw new NoSuchElementException("Walked off of the end...") - } + if (!hasNext) { + throw new NoSuchElementException("Walked off of the end...") } - val ret = toBeReturned.get - toBeReturned = None - nextHeader = None - (0, ret) + (0, readNextBatch()) } } From 4b1a401fd6e516cf0855bbb164d05ef981d5df27 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 28 Nov 2024 11:42:37 +0800 Subject: [PATCH 16/37] Support running CI_PART2 integration tests with JARs built by CI_PART1 (#11788) The CI_PART1 job uploads the built Spark Rapids tar file to Databricks DBFS storage. The CI_PART2 job retrieves the built tar file from DBFS storage and runs integration tests against it. Signed-off-by: timl --- jenkins/databricks/build.sh | 12 ++++++++++++ jenkins/databricks/test.sh | 25 ++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index f6ff6e913b6..baec99bb015 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -178,5 +178,17 @@ if [[ "$WITH_DEFAULT_UPSTREAM_SHIM" != "0" ]]; then -Dincluded_buildvers=$UPSTREAM_BUILDVER,$BUILDVER fi +# "Delete the unused object files to reduce the size of the Spark Rapids built tar." +rm -rf dist/target/jni-deps/ +find dist/target/parallel-world/ -mindepth 1 -maxdepth 1 ! -name META-INF -exec rm -rf {} + + cd /home/ubuntu tar -zcf spark-rapids-built.tgz spark-rapids + +# Back up spark rapids built jars for the CI_PART2 job to run integration tests +TEST_MODE=${TEST_MODE:-'DEFAULT'} +PLUGIN_BUILT_TGZ=${PLUGIN_BUILT_TGZ:-"$1"} +if [[ "$TEST_MODE" == "CI_PART1" && -n "$PLUGIN_BUILT_TGZ" ]]; then + mkdir -p $(dirname $PLUGIN_BUILT_TGZ) + cp spark-rapids-built.tgz $PLUGIN_BUILT_TGZ +fi diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index 38728161d12..abe09b226b4 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -38,6 +38,22 @@ set -ex +## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...' 
+[[ -n "$EXTRA_ENVS" ]] && export ${EXTRA_ENVS//','/' '} +# TEST_MODE +# - DEFAULT: all tests except cudf_udf tests +# - DELTA_LAKE_ONLY: delta_lake tests only +# - MULTITHREADED_SHUFFLE: shuffle tests only +# - PYARROW_ONLY: pyarrow tests only +# - CI_PART1 or CI_PART2 : part1 or part2 of the tests run in parallel from CI +TEST_MODE=${TEST_MODE:-'DEFAULT'} + +# CI_PART2 untars the spark-rapids tgz built by C1_PART1 instead of rebuilding it +PLUGIN_BUILT_TGZ=${PLUGIN_BUILT_TGZ:-"$1"} +if [[ "$TEST_MODE" == "CI_PART2" && -z "$LOCAL_JAR_PATH" && -f "$PLUGIN_BUILT_TGZ" ]]; then + tar -zxf $PLUGIN_BUILT_TGZ +fi + SOURCE_PATH="/home/ubuntu/spark-rapids" [[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH @@ -54,15 +70,6 @@ WITH_DEFAULT_UPSTREAM_SHIM=${WITH_DEFAULT_UPSTREAM_SHIM:-1} IS_SPARK_321_OR_LATER=0 [[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1 - -# TEST_MODE -# - DEFAULT: all tests except cudf_udf tests -# - DELTA_LAKE_ONLY: delta_lake tests only -# - MULTITHREADED_SHUFFLE: shuffle tests only -# - PYARROW_ONLY: pyarrow tests only -# - CI_PART1 or CI_PART2 : part1 or part2 of the tests run in parallel from CI -TEST_MODE=${TEST_MODE:-'DEFAULT'} - # Classloader config is here to work around classloader issues with # --packages in distributed setups, should be fixed by # https://github.com/NVIDIA/spark-rapids/pull/5646 From aa2da410511d8a737e207257769ec662a79174fe Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Fri, 29 Nov 2024 23:26:13 +0800 Subject: [PATCH 17/37] fix issue 11790 (#11792) Signed-off-by: Hongbin Ma (Mahone) --- .../spark/rapids/GpuAggregateExec.scala | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index 60f6dd68509..4ba20547e77 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -219,9 +219,6 @@ object AggregateUtils extends Logging { ): Boolean = { var repartitionHappened = false - if (hashSeed > 200) { - throw new IllegalStateException("Too many times of repartition, may hit a bug?") - } def repartitionAndClose(batch: SpillableColumnarBatch): Unit = { @@ -280,15 +277,23 @@ object AggregateUtils extends Logging { val newBuckets = batchesByBucket.flatMap(bucket => { if (needRepartitionAgain(bucket)) { - val nextLayerBuckets = - ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) - // Recursively merge and repartition the over sized bucket - repartitionHappened = - iterateAndRepartition( - new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, - helper, hashKeys, hashBucketNum, hashSeed + 7, - nextLayerBuckets) || repartitionHappened - nextLayerBuckets + if (hashSeed + 7 > 200) { + log.warn("Too many times of repartition, may hit a bug? 
Size for each batch in " + + "current bucket: " + bucket.map(_.sizeInBytes).mkString(", ") + " rows: " + + bucket.map(_.numRows()).mkString(", ") + " targetMergeBatchSize: " + + targetMergeBatchSize) + ArrayBuffer.apply(bucket) + } else { + val nextLayerBuckets = + ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + // Recursively merge and repartition the over sized bucket + repartitionHappened = + iterateAndRepartition( + new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, + helper, hashKeys, hashBucketNum, hashSeed + 7, + nextLayerBuckets) || repartitionHappened + nextLayerBuckets + } } else { ArrayBuffer.apply(bucket) } @@ -1075,8 +1080,8 @@ class GpuMergeAggregateIterator( closeOnExcept(new ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]]) { toAggregateBuckets => var currentSize = 0L - while (batchesByBucket.nonEmpty && - batchesByBucket.last.size() + currentSize < targetMergeBatchSize) { + while (batchesByBucket.nonEmpty && (toAggregateBuckets.isEmpty || + batchesByBucket.last.size() + currentSize < targetMergeBatchSize)) { val bucket = batchesByBucket.remove(batchesByBucket.size - 1) currentSize += bucket.map(_.sizeInBytes).sum toAggregateBuckets += bucket From bd14dbff6d5270c7374dc9e2ce6d00ff7e902420 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Mon, 2 Dec 2024 09:56:22 +0800 Subject: [PATCH 18/37] Incorporate checksum of internal dependencies in the GH cache key [skip ci] (#11791) * replace date with jni&private timestamp for cache key Signed-off-by: YanxuanLiu * use date if quering timestamp failed Signed-off-by: YanxuanLiu * add bash script to get timestamp Signed-off-by: YanxuanLiu * replace timestamp with sha1 Signed-off-by: YanxuanLiu --------- Signed-off-by: YanxuanLiu --- .github/workflows/mvn-verify-check.yml | 6 ++-- .../mvn-verify-check/get-deps-sha1.sh | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) create mode 100755 .github/workflows/mvn-verify-check/get-deps-sha1.sh diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 0aca7bc3655..b58799c6110 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -53,7 +53,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache @@ -165,7 +166,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. 
.github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache diff --git a/.github/workflows/mvn-verify-check/get-deps-sha1.sh b/.github/workflows/mvn-verify-check/get-deps-sha1.sh new file mode 100755 index 00000000000..aa7129bd3ef --- /dev/null +++ b/.github/workflows/mvn-verify-check/get-deps-sha1.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +scala_ver=${1:-"2.12"} +base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve" +project_jni="spark-rapids-jni" +project_private="rapids-4-spark-private_${scala_ver}" + +jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout) +private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout) + +jni_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') +private_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') + +sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}') + +echo $sha1md5 From cb31afb07847ff96b16d70ceec54ee1426fe5e64 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 2 Dec 2024 18:19:17 -0600 Subject: [PATCH 19/37] Fall back to CPU for non-UTC months_between (#11802) Signed-off-by: Robert (Bobby) Evans --- integration_tests/src/main/python/date_time_test.py | 12 ++++++------ .../spark/sql/rapids/datetimeExpressions.scala | 10 +++++++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 5a98e06fadc..1a7024dac85 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -139,34 +139,34 @@ def test_datediff(data_gen): hms_fallback = ['ProjectExec'] if not is_supported_time_zone() else [] -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between(): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between_first_day(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between_last_day(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, 
timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_round(): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, true)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_first_day_round(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", true)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_last_day_round(): assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 8ed4c50ac3b..0f382a7b6e6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -1217,7 +1217,8 @@ class MonthsBetweenExprMeta(expr: MonthsBetween, rule: DataFromReplacementRule) extends ExprMeta[MonthsBetween](expr, conf, parent, rule) { - override def isTimeZoneSupported = true + // See https://github.com/NVIDIA/spark-rapids/issues/11800 + override def isTimeZoneSupported = false override def convertToGpu(): GpuExpression = { val gpuChildren = childExprs.map(_.convertToGpu()) @@ -1287,6 +1288,13 @@ object GpuMonthsBetween { private def calcSecondsInDay(converted: ColumnVector): ColumnVector = { // Find the number of seconds that are not counted for in a day + // Rounding down to the current day, only works if you are in a time zone with no + // transition rules. This is because if a transition happens in between the start + // of the day and the timestamp we will be off. As such this will need to change to + // support other time zones, and it will need to take the timezone into account when + // calculating this. + // https://github.com/NVIDIA/spark-rapids/issues/11800 + // find the micros over by finding the part that is not days val microsInDay = withResource(converted.dateTimeFloor(DateTimeRoundingFrequency.DAY)) { days => // But we cannot subtract timestamps directly. They are both micros From 738c8e38fc23c1634667443864b80f085f2737ac Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 3 Dec 2024 09:07:10 +0800 Subject: [PATCH 20/37] exclude previous operator's time out of firstBatchHeuristic (#11794) Signed-off-by: Hongbin Ma (Mahone) --- .../main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index 4ba20547e77..d5bbe15209d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -2091,9 +2091,9 @@ class DynamicGpuPartialAggregateIterator( helper: AggHelper): (Iterator[ColumnarBatch], Boolean) = { // we need to decide if we are going to sort the data or not, so the very // first thing we need to do is get a batch and make a choice. 
+ val cb = cbIter.next() withResource(new NvtxWithMetrics("dynamic sort heuristic", NvtxColor.BLUE, metrics.opTime, metrics.heuristicTime)) { _ => - val cb = cbIter.next() lazy val estimatedGrowthAfterAgg: Double = closeOnExcept(cb) { cb => val numRows = cb.numRows() val cardinality = estimateCardinality(cb) From 7927ae9b3e6f565c9e7ba45c5353dbccbdd6d483 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:13:34 +0800 Subject: [PATCH 21/37] enable license header check & add header to files (#11786) Signed-off-by: YanxuanLiu --- .github/workflows/license-header-check.yml | 58 ++++++++++++++++++++++ docs/dev/idea-code-style-settings.xml | 16 ++++++ python/rapids/daemon.py | 1 + python/rapids/daemon_databricks.py | 1 + 4 files changed, 76 insertions(+) create mode 100644 .github/workflows/license-header-check.yml diff --git a/.github/workflows/license-header-check.yml b/.github/workflows/license-header-check.yml new file mode 100644 index 00000000000..e7f62399436 --- /dev/null +++ b/.github/workflows/license-header-check.yml @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A workflow to check copyright/license header +name: license header check + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + license-header-check: + runs-on: ubuntu-latest + if: "!contains(github.event.pull_request.title, '[bot]')" + steps: + - name: Get checkout depth + run: | + echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: ${{ env.PR_FETCH_DEPTH }} + + - name: license-header-check + uses: NVIDIA/spark-rapids-common/license-header-check@main + with: + included_file_patterns: | + *.yml, + *.yaml, + *.sh, + *.xml, + *.properties, + *.scala, + *.py, + build/*, + *.cpp, + *Dockerfile*, + *Jenkinsfile*, + *.ini, + *.java, + *.fbs + excluded_file_patterns: | + *target/*, + thirdparty/*, + sql-plugin/src/main/java/com/nvidia/spark/rapids/format/* + \ No newline at end of file diff --git a/docs/dev/idea-code-style-settings.xml b/docs/dev/idea-code-style-settings.xml index 165d30dde06..9f5c3c100dc 100644 --- a/docs/dev/idea-code-style-settings.xml +++ b/docs/dev/idea-code-style-settings.xml @@ -1,3 +1,19 @@ + +