Handle minimum GPU architecture supported [databricks] (#10540)
Fixes #10430. This PR ensures that Spark RAPIDS jobs are executed on supported GPU architectures without relying on manual configuration. 

### Changes:
1. Parses the `gpu_architectures` property from the `*version-info.properties` files generated by the native builds (see the sketch after this list).
2. Verifies that the job is running on a GPU architecture supported by both the cuDF and spark-rapids-jni libraries, and throws an exception if it is not.
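
For context, a minimal sketch of what such a properties entry might look like and how the semicolon-delimited value could be parsed; the architecture values shown here are illustrative, not taken from the actual build output:

```scala
import java.io.StringReader
import java.util.Properties

// Illustrative *version-info.properties content (values are assumptions):
//   gpu_architectures=70;75;80;86;90
val props = new Properties()
props.load(new StringReader("gpu_architectures=70;75;80;86;90"))

// Semicolon-delimited list parsed into a set of integer architectures,
// mirroring the parsing in RapidsPluginUtils.getSupportedGpuArchitectures below.
val supportedArchs: Set[Int] =
  Option(props.getProperty("gpu_architectures"))
    .map(_.split(";").map(_.trim.toInt).toSet)
    .getOrElse(Set.empty)
// supportedArchs == Set(70, 75, 80, 86, 90)
```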

### Testing
Tested on a Dataproc VM with an NVIDIA P4 GPU (architecture 6.1):

```
24/03/06 17:44:58 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.
24/03/06 17:45:10 ERROR RapidsExecutorPlugin: Exception in the executor plugin, shutting down!
java.lang.RuntimeException: Device architecture 61 is unsupported. Minimum supported architecture: 75.
        at com.nvidia.spark.rapids.RapidsPluginUtils$.checkGpuArchitectureInternal(Plugin.scala:366)
        at com.nvidia.spark.rapids.RapidsPluginUtils$.checkGpuArchitecture(Plugin.scala:375)
        at com.nvidia.spark.rapids.RapidsExecutorPlugin.init(Plugin.scala:461)
```
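
For deployments where this hard failure is not desired (for example, experimenting on older hardware), the check can be bypassed with the internal config introduced in this PR. A minimal sketch, assuming the plugin is enabled through the usual `spark.plugins` setting:

```scala
import org.apache.spark.SparkConf

// Sketch only: skipping the architecture verification may lead to
// functional failures on GPUs the native libraries were not built for.
val conf = new SparkConf()
  .set("spark.plugins", "com.nvidia.spark.SQLPlugin")
  .set("spark.rapids.skipGpuArchitectureCheck", "true")
```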


### Related PR
* NVIDIA/spark-rapids-jni#1840
* Add conf for minimum supported CUDA and error handling

Signed-off-by: Partho Sarthi <[email protected]>

* Revert "Add conf for minimum supported CUDA and error handling"

This reverts commit 7b8eaea.

* Verify the GPU architecture is supported by the plugin libraries

Signed-off-by: Partho Sarthi <[email protected]>

* Use semi-colon as delimiter and use intersection of supported gpu architectures

Signed-off-by: Partho Sarthi <[email protected]>

* Allow for compatibility with major architectures

Signed-off-by: Partho Sarthi <[email protected]>

* Check for version as integers

Signed-off-by: Partho Sarthi <[email protected]>

* Modify compatibility check for same major version and same or higher minor version

Signed-off-by: Partho Sarthi <[email protected]>

* Add a config to skip verification and refactor checking

Signed-off-by: Partho Sarthi <[email protected]>

* Update RapidsConf.scala

Co-authored-by: Jason Lowe <[email protected]>

* Update verification logic

Signed-off-by: Partho Sarthi <[email protected]>

* Update warning message

Signed-off-by: Partho Sarthi <[email protected]>

* Add unit tests and update warning message.

Signed-off-by: Partho Sarthi <[email protected]>

* Update exception class

Signed-off-by: Partho Sarthi <[email protected]>

* Address review comments

Signed-off-by: Partho Sarthi <[email protected]>

---------

Signed-off-by: Partho Sarthi <[email protected]>
Co-authored-by: Jason Lowe <[email protected]>
parthosa and jlowe authored Mar 15, 2024
1 parent 4393e9f commit 79c2a3b
Showing 3 changed files with 138 additions and 4 deletions.
69 changes: 65 additions & 4 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -73,6 +73,7 @@ object RapidsPluginUtils extends Logging {
private val TASK_GPU_AMOUNT_KEY = "spark.task.resource.gpu.amount"
private val EXECUTOR_GPU_AMOUNT_KEY = "spark.executor.resource.gpu.amount"
private val SPARK_MASTER = "spark.master"
private val SPARK_RAPIDS_REPO_URL = "https://github.com/NVIDIA/spark-rapids"

{
val pluginProps = loadProps(PLUGIN_PROPS_FILENAME)
@@ -346,6 +347,63 @@ object RapidsPluginUtils extends Logging {
loadExtensions(classOf[SparkPlugin], pluginClasses)
}
}

/**
* Extracts supported GPU architectures from the given properties file
*/
private def getSupportedGpuArchitectures(propFileName: String): Set[Int] = {
val props = RapidsPluginUtils.loadProps(propFileName)
Option(props.getProperty("gpu_architectures"))
.getOrElse(throw new RuntimeException(s"GPU architectures not found in $propFileName"))
.split(";")
.map(_.toInt)
.toSet
}

/**
* Checks if the current GPU architecture is supported by the spark-rapids-jni
* and cuDF libraries.
*/
def validateGpuArchitecture(): Unit = {
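// e.g. a GPU with compute capability 7.5 is encoded as 75 (major * 10 + minor)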
val gpuArch = Cuda.getComputeCapabilityMajor * 10 + Cuda.getComputeCapabilityMinor
validateGpuArchitectureInternal(gpuArch, getSupportedGpuArchitectures(JNI_PROPS_FILENAME),
getSupportedGpuArchitectures(CUDF_PROPS_FILENAME))
}

/**
* Checks the validity of the provided GPU architecture in the provided architecture set.
*
* See: https://docs.nvidia.com/cuda/ampere-compatibility-guide/index.html
*/
def validateGpuArchitectureInternal(gpuArch: Int, jniSupportedGpuArchs: Set[Int],
cudfSupportedGpuArchs: Set[Int]): Unit = {
val supportedGpuArchs = jniSupportedGpuArchs.intersect(cudfSupportedGpuArchs)
if (supportedGpuArchs.isEmpty) {
val jniSupportedGpuArchsStr = jniSupportedGpuArchs.toSeq.sorted.mkString(", ")
val cudfSupportedGpuArchsStr = cudfSupportedGpuArchs.toSeq.sorted.mkString(", ")
throw new IllegalStateException(s"Compatibility check failed for GPU architecture " +
s"$gpuArch. Supported GPU architectures by JNI: $jniSupportedGpuArchsStr and " +
s"cuDF: $cudfSupportedGpuArchsStr. Please report this issue at $SPARK_RAPIDS_REPO_URL." +
s" This check can be disabled by setting `spark.rapids.skipGpuArchitectureCheck` to" +
s" `true`, but it may lead to functional failures.")
}

val minSupportedGpuArch = supportedGpuArchs.min
// Check if the device architecture is supported
if (gpuArch < minSupportedGpuArch) {
throw new RuntimeException(s"Device architecture $gpuArch is unsupported." +
s" Minimum supported architecture: $minSupportedGpuArch.")
}
val supportedMajorGpuArchs = supportedGpuArchs.map(_ / 10)
val majorGpuArch = gpuArch / 10
// Warn the user if the device's major architecture is not available
if (!supportedMajorGpuArchs.contains(majorGpuArch)) {
val supportedMajorArchStr = supportedMajorGpuArchs.toSeq.sorted.mkString(", ")
logWarning(s"No precompiled binaries for device major architecture $majorGpuArch. " +
"This may lead to expensive JIT compile on startup. " +
s"Binaries available for architectures $supportedMajorArchStr.")
}
}
}

/**
@@ -427,17 +485,20 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
pluginContext: PluginContext,
extraConf: java.util.Map[String, String]): Unit = {
try {
if (Cuda.getComputeCapabilityMajor < 6) {
throw new RuntimeException(s"GPU compute capability ${Cuda.getComputeCapabilityMajor}" +
" is unsupported, requires 6.0+")
}
// if configured, re-register checking leaks hook.
reRegisterCheckLeakHook()

val sparkConf = pluginContext.conf()
val numCores = RapidsPluginUtils.estimateCoresOnExec(sparkConf)
val conf = new RapidsConf(extraConf.asScala.toMap)

// Checks if the current GPU architecture is supported by the
// spark-rapids-jni and cuDF libraries.
// Note: We allow this check to be skipped for off-chance cases.
if (!conf.skipGpuArchCheck) {
RapidsPluginUtils.validateGpuArchitecture()
}

// Fail if there are multiple plugin jars in the classpath.
RapidsPluginUtils.detectMultipleJars(conf)

sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -2141,6 +2141,13 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression.
.booleanConf
.createOptional

val SKIP_GPU_ARCH_CHECK = conf("spark.rapids.skipGpuArchitectureCheck")
.doc("When true, skips GPU architecture compatibility check. Note that this check " +
"might still be present in cuDF.")
.internal()
.booleanConf
.createWithDefault(false)

private def printSectionHeader(category: String): Unit =
println(s"\n### $category")

@@ -2906,6 +2913,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val splitUntilSizeOverride: Option[Long] = get(SPLIT_UNTIL_SIZE_OVERRIDE)

lazy val skipGpuArchCheck: Boolean = get(SKIP_GPU_ARCH_CHECK)

private val optimizerDefaults = Map(
// this is not accurate because CPU projections do have a cost due to appending values
// to each row that is produced, but this needs to be a really small number because
GpuArchitectureTestSuite.scala (new file)
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import com.nvidia.spark.rapids.RapidsPluginUtils.validateGpuArchitectureInternal
import org.scalatest.funsuite.AnyFunSuite

class GpuArchitectureTestSuite extends AnyFunSuite {
test("test supported architecture") {
val jniSupportedGpuArchs = Set(50, 60, 70)
val cudfSupportedGpuArchs = Set(50, 60, 65, 70)
val gpuArch = 60
validateGpuArchitectureInternal(gpuArch, jniSupportedGpuArchs, cudfSupportedGpuArchs)
}

test("test unsupported architecture") {
val jniSupportedGpuArchs = Set(50, 60, 70)
val cudfSupportedGpuArchs = Set(50, 60, 65, 70)
val gpuArch = 40
val exception = intercept[RuntimeException] {
validateGpuArchitectureInternal(gpuArch, jniSupportedGpuArchs, cudfSupportedGpuArchs)
}
assert(exception.getMessage.contains(s"Device architecture $gpuArch is unsupported"))
}

test("test supported major architecture with higher minor version") {
val jniSupportedGpuArchs = Set(50, 60, 65, 70)
val cudfSupportedGpuArchs = Set(50, 60, 65, 70)
val gpuArch = 67
validateGpuArchitectureInternal(gpuArch, jniSupportedGpuArchs, cudfSupportedGpuArchs)
}

test("test supported major architecture with lower minor version") {
val jniSupportedGpuArchs = Set(50, 60, 65, 70)
val cudfSupportedGpuArchs = Set(50, 60, 65, 70)
val gpuArch = 63
validateGpuArchitectureInternal(gpuArch, jniSupportedGpuArchs, cudfSupportedGpuArchs)
}

test("test empty supported architecture set") {
val jniSupportedGpuArchs = Set(50, 60)
val cudfSupportedGpuArchs = Set(70, 80)
val gpuArch = 60
val exception = intercept[IllegalStateException] {
validateGpuArchitectureInternal(gpuArch, jniSupportedGpuArchs, cudfSupportedGpuArchs)
}
assert(exception.getMessage.contains(
s"Compatibility check failed for GPU architecture $gpuArch"))
}
}
