diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 93557017b08..1d7b0ab8e0b 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -77,7 +77,8 @@ jobs: github.actor == 'Feng-Jiang28' || github.actor == 'SurajAralihalli' || github.actor == 'jihoonson' || - github.actor == 'ustcfy' + github.actor == 'ustcfy' || + github.actor == 'knoguchi22' ) steps: - name: Check if comment is issued by authorized person diff --git a/.github/workflows/license-header-check.yml b/.github/workflows/license-header-check.yml new file mode 100644 index 00000000000..e7f62399436 --- /dev/null +++ b/.github/workflows/license-header-check.yml @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A workflow to check copyright/license header +name: license header check + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + license-header-check: + runs-on: ubuntu-latest + if: "!contains(github.event.pull_request.title, '[bot]')" + steps: + - name: Get checkout depth + run: | + echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: ${{ env.PR_FETCH_DEPTH }} + + - name: license-header-check + uses: NVIDIA/spark-rapids-common/license-header-check@main + with: + included_file_patterns: | + *.yml, + *.yaml, + *.sh, + *.xml, + *.properties, + *.scala, + *.py, + build/*, + *.cpp, + *Dockerfile*, + *Jenkinsfile*, + *.ini, + *.java, + *.fbs + excluded_file_patterns: | + *target/*, + thirdparty/*, + sql-plugin/src/main/java/com/nvidia/spark/rapids/format/* + \ No newline at end of file diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 0aca7bc3655..b58799c6110 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -53,7 +53,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache @@ -165,7 +166,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. 
.github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache diff --git a/.github/workflows/mvn-verify-check/get-deps-sha1.sh b/.github/workflows/mvn-verify-check/get-deps-sha1.sh new file mode 100755 index 00000000000..aa7129bd3ef --- /dev/null +++ b/.github/workflows/mvn-verify-check/get-deps-sha1.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +scala_ver=${1:-"2.12"} +base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve" +project_jni="spark-rapids-jni" +project_private="rapids-4-spark-private_${scala_ver}" + +jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout) +private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout) + +jni_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') +private_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') + +sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}') + +echo $sha1md5 diff --git a/.github/workflows/mvn-verify-check/populate-daily-cache.sh b/.github/workflows/mvn-verify-check/populate-daily-cache.sh index b93cd0b6b49..d4e9b07d1a7 100755 --- a/.github/workflows/mvn-verify-check/populate-daily-cache.sh +++ b/.github/workflows/mvn-verify-check/populate-daily-cache.sh @@ -14,22 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -x -max_retry=3; delay=30; i=1 +set -e +set -o pipefail + if [[ $SCALA_VER == '2.12' ]]; then pom='pom.xml' elif [[ $SCALA_VER == '2.13' ]]; then pom='scala2.13/pom.xml' fi + +max_retry=3; delay=30; i=1 while true; do + buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) && { - python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \ - xargs -n 1 -I {} bash -c \ - "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies" - + for buildver in "${buildvers[@]}"; do + mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies + done + } && { # compile base versions to cache scala compiler and compiler bridge - mvn $COMMON_MVN_FLAGS --file $pom \ - process-test-resources -pl sql-plugin-api -am + mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am } && break || { if [[ $i -le $max_retry ]]; then echo "mvn command failed. 
Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2)) @@ -37,4 +40,4 @@ while true; do echo "mvn command failed. Exit 1"; exit 1 fi } -done \ No newline at end of file +done diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83b30747abd..e4077ee5994 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index 94b73565190..61914e49df0 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. 
com.nvidia rapids-4-spark_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index c7a6c220247..a47745776bc 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index cddcf0c1ce1..f3339375806 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/build/get_buildvers.py b/build/get_buildvers.py index 5fe864670b5..263003ea99f 100644 --- a/build/get_buildvers.py +++ b/build/get_buildvers.py @@ -34,7 +34,7 @@ def _get_buildvers(buildvers, pom_file, logger=None): else: no_snapshots.append(release) excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) - if excluded_shims is not None: + if excluded_shims is not None and excluded_shims.text: for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: if removed_shim in snapshots: snapshots.remove(removed_shim) diff --git a/datagen/README.md b/datagen/README.md index 022cc2f1eba..1c49c8db58e 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.12.0 release would be -`target/datagen_2.12-24.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 25.02.0 release would be +`target/datagen_2.12-25.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-25.02.0-spark330.jar ``` After that you should be good to go. 
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index a728ad9a13e..8e692173f5f 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-25.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index 9bdf897cfd7..fc2d8bc677c 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index 2884968660d..3480718dbc7 100644 --- a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala +++ b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} object DataGenExprShims { - def columnToExpr(c: Column): Expression = c - def exprToColumn(e: Expression): Column = e + def columnToExpr(c: Column): Expression = expression(c) + def exprToColumn(e: Expression): Column = column(e) } diff --git a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala index 0c212d6842a..5b26943a541 100644 --- a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala +++ b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD} +import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.ThreadUtils @@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec( private[sql] lazy val readMetrics = SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) - override lazy val additionalMetrics: Map[String, GpuMetric] = Map( - "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"), - "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"), - "rapidsShuffleSerializationTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"), - "rapidsShuffleDeserializationTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"), - "rapidsShuffleWriteTime" -> - createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. 
shuffle write time"), - "rapidsShuffleCombineTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"), - "rapidsShuffleWriteIoTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"), - "rapidsShuffleReadTime" -> - createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time") - ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics) + override lazy val additionalMetrics : Map[String, GpuMetric] = { + createAdditionalExchangeMetrics(this) ++ + GpuMetric.wrap(readMetrics) ++ + GpuMetric.wrap(writeMetrics) + } override lazy val allMetrics: Map[String, GpuMetric] = { Map( @@ -98,7 +88,7 @@ case class GpuOptimizeWriteExchangeExec( } private lazy val serializer: Serializer = - new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"), + new GpuColumnarBatchSerializer(allMetrics, child.output.map(_.dataType).toArray, RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf)) diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 1d41911c767..ba5443a7be2 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 7514088ca3a..602686d79ab 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 2ed0ea3b159..7867c573607 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 1daebdd0efb..f537de0be36 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 36ec92b70c0..443681b6cb3 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT 
../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 95f54c6807c..4812c9d0097 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 4d792ee1ca5..306553caa43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 4b229e2e5b5..c7b4a4e2738 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-spark350db143/pom.xml b/delta-lake/delta-spark350db143/pom.xml index 1bca394b67c..1e166244e1e 100644 --- a/delta-lake/delta-spark350db143/pom.xml +++ b/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index 6d0471f9f01..31b8e03b366 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index d628dd4ba3b..b34292a25cd 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index f3157b46099..a4427d9495a 100644 --- a/docs/additional-functionality/advanced_configs.md +++ 
b/docs/additional-functionality/advanced_configs.md @@ -33,6 +33,7 @@ Name | Description | Default Value | Applicable at spark.rapids.filecache.blockPathRegexp|A regular expression to decide which paths will not be cached when the file cache is enabled. If a path is blocked by this regexp but is allowed by spark.rapids.filecache.allowPathRegexp, then the path is blocked.|None|Startup spark.rapids.filecache.checkStale|Controls whether the cached is checked for being out of date with respect to the input file. When enabled, the data that has been cached locally for a file will be invalidated if the file is updated after being cached. This feature is only necessary if an input file for a Spark application can be changed during the lifetime of the application. If an individual input file will not be overwritten during the Spark application then performance may be improved by setting this to false.|true|Startup spark.rapids.filecache.maxBytes|Controls the maximum amount of data that will be cached locally. If left unspecified, it will use half of the available disk space detected on startup for the configured Spark local disks.|None|Startup +spark.rapids.filecache.minimumFreeSpace.bytes|Specify the minimum amount of free space in the Spark local disks. When the amount of free space on the Spark local disks drops below this value, cache data will be removed automatically to free disk space. A zero or negative value will disable this feature. Note if multiple Spark applications running on the same node, or there are other applications running with heavy disk writing, the filecache may not drop caches in time and may cause full disk errors. Please increase this value for this case.|32212254720|Startup spark.rapids.filecache.useChecksums|Whether to write out and verify checksums for the cached local files.|false|Startup spark.rapids.gpu.resourceName|The name of the Spark resource that represents a GPU that you want the plugin to use if using custom resources with Spark.|gpu|Startup spark.rapids.memory.gpu.allocFraction|The fraction of available (free) GPU memory that should be allocated for pooled memory. This must be less than or equal to the maximum limit configured via spark.rapids.memory.gpu.maxAllocFraction, and greater than or equal to the minimum limit configured via spark.rapids.memory.gpu.minAllocFraction.|1.0|Startup @@ -95,8 +96,8 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.format.hive.text.write.enabled|When set to false disables Hive text table write acceleration|false|Runtime spark.rapids.sql.format.iceberg.enabled|When set to false disables all Iceberg acceleration|true|Runtime spark.rapids.sql.format.iceberg.read.enabled|When set to false disables Iceberg input acceleration|true|Runtime -spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|false|Runtime -spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|false|Runtime +spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. 
(only input is currently supported anyways)|true|Runtime +spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|true|Runtime spark.rapids.sql.format.orc.enabled|When set to false disables all orc input and output acceleration|true|Runtime spark.rapids.sql.format.orc.floatTypesToString.enable|When reading an ORC file, the source data schemas(schemas of ORC file) may differ from the target schemas (schemas of the reader), we need to handle the castings from source type to target type. Since float/double numbers in GPU have different precision with CPU, when casting float/double to string, the result of GPU is different from result of CPU spark. Its default value is `true` (this means the strings result will differ from result of CPU). If it's set `false` explicitly and there exists casting from float/double to string in the job, then such behavior will cause an exception, and the job will fail.|true|Runtime spark.rapids.sql.format.orc.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel. Used with MULTITHREADED reader, see spark.rapids.sql.format.orc.reader.type.|2147483647|Runtime @@ -278,7 +279,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.IsNaN|`isnan`|Checks if a value is NaN|true|None| spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None| spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None| -spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case| +spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|true|None| spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.| spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None| spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None| diff --git a/docs/compatibility.md b/docs/compatibility.md index 1382b1a9a1f..0c745069032 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -316,125 +316,102 @@ case. ## JSON -The JSON format read is an experimental feature which is expected to have some issues, so we disable -it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and -`spark.rapids.sql.format.json.read.enabled`. +JSON, despite being a standard format, has some ambiguity in it. Spark also offers the ability to allow +some invalid JSON to be parsed. We have tried to provide JSON parsing that is compatible with +what Apache Spark does support. Note that Spark itself has changed through different releases, and we will +try to call out which releases we offer different results for. 
JSON parsing is enabled by default +except for date and timestamp types where we still have work to complete. If you wish to disable +JSON Scan you can set `spark.rapids.sql.format.json.enabled` or +`spark.rapids.sql.format.json.read.enabled` to false. To disable `from_json` you can set +`spark.rapids.sql.expression.JsonToStructs` to false. -### Invalid JSON +### Limits -In Apache Spark on the CPU if a line in the JSON file is invalid the entire row is considered -invalid and will result in nulls being returned for all columns. It is considered invalid if it -violates the JSON specification, but with a few extensions. +In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After +3.5.0 this was updated to be 1,000 by default. The current GPU implementation of JSON Scan and +`from_json` limits this to 254 no matter what version of Spark is used. If the nesting level is +over this the JSON is considered invalid and all values will be returned as nulls. +`get_json_object` and `json_tuple` have a maximum nesting depth of 64. An exception is thrown if +the nesting depth goes over the maximum. - * Single quotes are allowed to quote strings and keys - * Unquoted values like NaN and Infinity can be parsed as floating point values - * Control characters do not need to be replaced with the corresponding escape sequences in a - quoted string. - * Garbage at the end of a row, if there is valid JSON at the beginning of the row, is ignored. +Spark 3.5.0 and above impose a maximum string length of 20,000,000 and a maximum number length of +1,000. We do not have any of these limits on the GPU. -The GPU implementation does the same kinds of validations, but many of them are done on a per-column -basis, which, for example, means if a number is formatted incorrectly, it is likely only that value -will be considered invalid and return a null instead of nulls for the entire row. +We, like Spark, cannot support a JSON string that is larger than 2 GiB in size. -There are options that can be used to enable and disable many of these features which are mostly -listed below. +### JSON Validation -### JSON options +Spark supports the option `allowNonNumericNumbers`. Versions of Spark prior to 3.3.0 were inconsistent between +quoted and non-quoted values ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The +GPU implementation is consistent with 3.3.0 and above. -Spark supports passing options to the JSON parser when reading a dataset. In most cases if the RAPIDS Accelerator -sees one of these options that it does not support it will fall back to the CPU. In some cases we do not. The -following options are documented below. +### JSON Floating Point Types -- `allowNumericLeadingZeros` - Allows leading zeros in numbers (e.g. 00012). By default this is set to false. - When it is false Spark considers the JSON invalid if it encounters this type of number. The RAPIDS - Accelerator supports validating columns that are returned to the user with this option on or off. - -- `allowUnquotedControlChars` - Allows JSON Strings to contain unquoted control characters (ASCII characters with - value less than 32, including tab and line feed characters) or not. By default this is set to false. If the schema - is provided while reading JSON file, then this flag has no impact on the RAPIDS Accelerator as it always allows - unquoted control characters but Spark sees these are invalid are returns nulls.
However, if the schema is not provided - and this option is false, then RAPIDS Accelerator's behavior is same as Spark where an exception is thrown - as discussed in `JSON Schema discovery` section. - -- `allowNonNumericNumbers` - Allows `NaN` and `Infinity` values to be parsed (note that these are not valid numeric - values in the [JSON specification](https://json.org)). Spark versions prior to 3.3.0 have inconsistent behavior and will - parse some variants of `NaN` and `Infinity` even when this option is disabled - ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The RAPIDS Accelerator behavior is consistent with - Spark version 3.3.0 and later. -### Nesting -In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After -3.5.0 this was updated to be 1000 by default. The current GPU implementation limits this to 254 -no matter what version of Spark is used. If the nesting level is over this the JSON is considered -invalid and all values will be returned as nulls. - -Mixed types can have some problems. If an item being read could have some lines that are arrays -and others that are structs/dictionaries it is possible an error will be thrown. - -Dates and Timestamps have some issues and may return values for technically invalid inputs. - -Floating point numbers have issues generally like with the rest of Spark, and we can parse them into -a valid floating point number, but it might not match 100% with the way Spark does it. - -Strings are supported, but the data returned might not be normalized in the same way as the CPU -implementation. Generally this comes down to the GPU not modifying the input, whereas Spark will -do things like remove extra white space and parse numbers before turning them back into a string. +Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). -### JSON Floating Point +### JSON Integral Types -Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). +Versions of Spark prior to 3.3.0 would parse quoted integer values, like "1". But 3.3.0 and above consider +these to be invalid and will return `null` when parsed as an integral type. The GPU implementation +follows 3.3.0 and above. -Prior to Spark 3.3.0, reading JSON strings such as `"+Infinity"` when specifying that the data type is `FloatType` -or `DoubleType` caused these values to be parsed even when `allowNonNumericNumbers` is set to false. Also, Spark -versions prior to 3.3.0 only supported the `"Infinity"` and `"-Infinity"` representations of infinity and did not -support `"+INF"`, `"-INF"`, or `"+Infinity"`, which Spark considers valid when unquoted. The GPU JSON reader is -consistent with the behavior in Spark 3.3.0 and later. +### JSON Decimal Types -Another limitation of the GPU JSON reader is that it will parse strings containing non-string boolean or numeric values where -Spark will treat them as invalid inputs and will just return `null`. +Spark supports parsing decimal types either formatted as floating point numbers or integral numbers, even if they are +in a quoted string. If the value is in a quoted string, the locale of the JVM is used to determine the number format. +If the locale is not `US`, which is the default, we will fall back to the CPU because we do not currently +parse those numbers correctly. The `US` format removes all commas ',' from the quoted string. +As a part of this, though, non-Arabic numbers are also supported.
We do not support parsing these numbers; +see [issue 10532](https://github.com/NVIDIA/spark-rapids/issues/10532). -### JSON Dates/Timestamps +### JSON Date/Timestamp Types Dates and timestamps are not supported by default in JSON parser, since the GPU implementation is not 100% compatible with Apache Spark. If needed, they can be turned on through the config `spark.rapids.sql.json.read.datetime.enabled`. -Once enabled, the JSON parser still does not support the `TimestampNTZ` type and will fall back to CPU -if `spark.sql.timestampType` is set to `TIMESTAMP_NTZ` or if an explicit schema is provided that -contains the `TimestampNTZ` type. +This config works for both JSON scan and `from_json`. Once enabled, the JSON parser still does +not support the `TimestampNTZ` type and will fall back to CPU if `spark.sql.timestampType` is set +to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. There is currently no support for reading numeric values as timestamps and null values are returned instead -([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast -to timestamp. +([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast to timestamp. -### JSON Schema discovery +### JSON Arrays and Structs with Overflowing Numbers -Spark SQL can automatically infer the schema of a JSON dataset if schema is not provided explicitly. The CPU -handles schema discovery and there is no GPU acceleration of this. By default Spark will read/parse the entire -dataset to determine the schema. This means that some options/errors which are ignored by the GPU may still -result in an exception if used with schema discovery. +Spark is inconsistent between versions in how it handles overflowing numbers that are nested in either an array +or a non-top-level struct. In some versions only the value that overflowed is marked as null. In other versions the +wrapping array or struct is marked as null. We currently only mark the individual value as null. This matches +versions 3.4.2 and above of Spark for structs. Arrays on most versions of Spark invalidate the entire array if there +is a single value that overflows within it. -### `from_json` function +### Duplicate Struct Names -`JsonToStructs` of `from_json` is based on the same code as reading a JSON lines file. There are -a few differences with it. +The JSON specification technically allows for duplicate keys in a struct, but does not explain what to +do with them. In the case of Spark, which value wins is inconsistent between operators. For `get_json_object` it +depends on the query being performed. We do not always match what Spark does. We do match it in many cases, +but we consider this enough of a corner case that we have not tried to make it work in all cases. -The `from_json` function is disabled by default because it is experimental and has some known -incompatibilities with Spark, and can be enabled by setting -`spark.rapids.sql.expression.JsonToStructs=true`. You don't need to set -`spark.rapids.sql.format.json.enabled` and`spark.rapids.sql.format.json.read.enabled` to true. -In addition, if the input schema contains date and/or timestamp types, an additional config -`spark.rapids.sql.json.read.datetime.enabled` also needs to be set to `true` in order -to enable this function on the GPU. +We also do not support schemas where there are duplicate column names. We just fall back to the CPU for those cases.
-There is no schema discovery as a schema is required as input to `from_json` +### JSON Normalization (String Types) -In addition to `structs`, a top level `map` type is supported, but only if the key and value are -strings. +In versions of Spark prior to 4.0.0 input JSON Strings were parsed to JSON tokens and then converted back to +strings. This effectively normalizes the output string. So things like single quotes are transformed into double +quotes, floating point numbers are parsed and converted back to strings possibly changing the format, and +escaped characters are converted back to their simplest form. We try to support this on the GPU as well. Single quotes +will be converted to double quotes. Only `get_json_object` and `json_tuple` attempt to normalize floating point +numbers. There is no implementation on the GPU right now that tries to normalize escape characters. + +### `from_json` Function + +`JsonToStructs` or `from_json` is based on the same code as reading a JSON lines file. There are +a few differences with it. -### `to_json` function +The main difference is that `from_json` supports parsing Maps and Arrays directly from a JSON column, whereas +JSON Scan only supports parsing top level structs. The GPU implementation of `from_json` has support for parsing +a `MAP` as a top level schema, but does not currently support arrays at the top level. -The `to_json` function is disabled by default because it is experimental and has some known incompatibilities -with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsToJson=true`. +### `to_json` Function Known issues are: @@ -442,7 +419,7 @@ Known issues are: produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. - Not all JSON options are respected -### get_json_object +### `get_json_object` Function Known issue: - [Floating-point number normalization error](https://github.com/NVIDIA/spark-rapids-jni/issues/1922). `get_json_object` floating-point number normalization on the GPU could sometimes return incorrect results if the string contains high-precision values, see the String to Float and Float to String section for more details. diff --git a/docs/configs.md b/docs/configs.md index 7f9544496c4..04aecb41f02 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/idea-code-style-settings.xml b/docs/dev/idea-code-style-settings.xml index 165d30dde06..9f5c3c100dc 100644 --- a/docs/dev/idea-code-style-settings.xml +++ b/docs/dev/idea-code-style-settings.xml @@ -1,3 +1,19 @@ + +