From 79c2a3bcc2532eee0c122abca9ffe4555fd5c137 Mon Sep 17 00:00:00 2001
From: Partho Sarthi <psarthi@nvidia.com>
Date: Thu, 14 Mar 2024 22:22:46 -0700
Subject: [PATCH] Handle minimum GPU architecture supported [databricks]
 (#10540)

Fixes #10430. This PR ensures that Spark RAPIDS jobs are executed on supported GPU architectures without relying on manual configuration.

### Changes:
1. Processes the `gpu_architectures` property from the `*version-info.properties` files generated by the native builds.
2. Verifies if the user is running the job on an architecture supported by the cuDF and JNI libraries and throws an exception if the architecture is unsupported.

### Testing
Tested on a Dataproc VM with an NVIDIA P4 GPU (GPU architecture 6.1).

```
24/03/06 17:44:58 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.
24/03/06 17:45:10 ERROR RapidsExecutorPlugin: Exception in the executor plugin, shutting down!
java.lang.RuntimeException: Device architecture 61 is unsupported. Minimum supported architecture: 75.
        at com.nvidia.spark.rapids.RapidsPluginUtils$.checkGpuArchitectureInternal(Plugin.scala:366)
        at com.nvidia.spark.rapids.RapidsPluginUtils$.checkGpuArchitecture(Plugin.scala:375)
        at com.nvidia.spark.rapids.RapidsExecutorPlugin.init(Plugin.scala:461)
```


### Related PR
* https://github.com/NVIDIA/spark-rapids-jni/pull/1840
* Add conf for minimum supported CUDA and error handling

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Revert "Add conf for minimum supported CUDA and error handling"

This reverts commit 7b8eaeaf4c0f3f9c6d781602eb59eb19c823dca0.

* Verify the GPU architecture is supported by the plugin libraries

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Use semi-colon as delimiter and use intersection of supported gpu architectures

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Allow for compatibility with major architectures

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Check for version as integers

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Modify compatibility check for same major version and same or higher minor version

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Add a config to skip verification and refactor checking

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Update RapidsConf.scala

Co-authored-by: Jason Lowe <jlowe@nvidia.com>

* Update verification logic

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Update warning message

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Add unit tests and update warning message.

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Update exception class

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

* Address review comments

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>

---------

Signed-off-by: Partho Sarthi <psarthi@nvidia.com>
Co-authored-by: Jason Lowe <jlowe@nvidia.com>
---
 .../com/nvidia/spark/rapids/Plugin.scala      | 69 +++++++++++++++++--
 .../com/nvidia/spark/rapids/RapidsConf.scala  |  9 +++
 .../rapids/GpuArchitectureTestSuite.scala     | 64 +++++++++++++++++
 3 files changed, 138 insertions(+), 4 deletions(-)
 create mode 100644 tests/src/test/scala/com/nvidia/spark/rapids/GpuArchitectureTestSuite.scala

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
index 6ff94a0e15a..aabbacccfe3 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -73,6 +73,7 @@ object RapidsPluginUtils extends Logging {
   private val TASK_GPU_AMOUNT_KEY = "spark.task.resource.gpu.amount"
   private val EXECUTOR_GPU_AMOUNT_KEY = "spark.executor.resource.gpu.amount"
   private val SPARK_MASTER = "spark.master"
+  private val SPARK_RAPIDS_REPO_URL = "https://github.com/NVIDIA/spark-rapids"
 
   {
     val pluginProps = loadProps(PLUGIN_PROPS_FILENAME)
@@ -346,6 +347,63 @@ object RapidsPluginUtils extends Logging {
       loadExtensions(classOf[SparkPlugin], pluginClasses)
     }
   }
+
+  /**
+   * Parses the `;`-separated `gpu_architectures` property of the given properties file into a
+   * Set[Int]. Blank entries and surrounding whitespace are ignored; a missing property throws.
+   */
+  private def getSupportedGpuArchitectures(propFileName: String): Set[Int] = {
+    val props = RapidsPluginUtils.loadProps(propFileName)
+    Option(props.getProperty("gpu_architectures"))
+      .getOrElse(throw new RuntimeException(s"GPU architectures not found in $propFileName"))
+      .split(";").map(_.trim).filter(_.nonEmpty)
+      .map(_.toInt).toSet
+  }
+
+  /**
+   * Validates the current GPU's compute capability against the spark-rapids-jni and cuDF builds.
+   */
+  def validateGpuArchitecture(): Unit = {
+    val deviceArch = 10 * Cuda.getComputeCapabilityMajor + Cuda.getComputeCapabilityMinor
+    val jniArchs = getSupportedGpuArchitectures(JNI_PROPS_FILENAME)
+    val cudfArchs = getSupportedGpuArchitectures(CUDF_PROPS_FILENAME)
+    validateGpuArchitectureInternal(deviceArch, jniArchs, cudfArchs)
+  }
+
+  /**
+   * Validates `gpuArch` against the architectures common to the JNI and cuDF libraries.
+   * Throws when there is no common architecture or when the device is older than the oldest
+   * common one; only warns when no binaries exist for the device's major architecture.
+   * See: https://docs.nvidia.com/cuda/ampere-compatibility-guide/index.html
+   */
+  def validateGpuArchitectureInternal(gpuArch: Int, jniSupportedGpuArchs: Set[Int],
+      cudfSupportedGpuArchs: Set[Int]): Unit = {
+    def asList(archs: Set[Int]): String = archs.toSeq.sorted.mkString(", ")
+    val commonArchs = jniSupportedGpuArchs & cudfSupportedGpuArchs
+    if (commonArchs.isEmpty) {
+      throw new IllegalStateException(s"Compatibility check failed for GPU architecture " +
+        s"$gpuArch. Supported GPU architectures by JNI: ${asList(jniSupportedGpuArchs)} and " +
+        s"cuDF: ${asList(cudfSupportedGpuArchs)}. Please report this issue at " +
+        s"$SPARK_RAPIDS_REPO_URL. This check can be disabled by setting " +
+        "`spark.rapids.skipGpuArchitectureCheck` to `true`, but it may lead to functional " +
+        "failures.")
+    }
+
+    val oldestArch = commonArchs.min
+    // Reject devices older than the oldest architecture both libraries list
+    if (gpuArch < oldestArch) {
+      throw new RuntimeException(s"Device architecture $gpuArch is unsupported." +
+        s" Minimum supported architecture: $oldestArch.")
+    }
+    val availableMajors = commonArchs.map(_ / 10)
+    val deviceMajor = gpuArch / 10
+    // A missing major architecture is reported with a warning only, not an exception
+    if (!availableMajors(deviceMajor)) {
+      logWarning(s"No precompiled binaries for device major architecture $deviceMajor. " +
+        "This may lead to expensive JIT compile on startup. " +
+        s"Binaries available for architectures ${asList(availableMajors)}.")
+    }
+  }
 }
 
 /**
@@ -427,10 +485,6 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
       pluginContext: PluginContext,
       extraConf: java.util.Map[String, String]): Unit = {
     try {
-      if (Cuda.getComputeCapabilityMajor < 6) {
-        throw new RuntimeException(s"GPU compute capability ${Cuda.getComputeCapabilityMajor}" +
-          " is unsupported, requires 6.0+")
-      }
       // if configured, re-register checking leaks hook.
       reRegisterCheckLeakHook()
 
@@ -438,6 +492,13 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
       val numCores = RapidsPluginUtils.estimateCoresOnExec(sparkConf)
       val conf = new RapidsConf(extraConf.asScala.toMap)
 
+      // Checks if the current GPU architecture is supported by the
+      // spark-rapids-jni and cuDF libraries.
+      // Note: The check can be bypassed via `spark.rapids.skipGpuArchitectureCheck` if needed.
+      if (!conf.skipGpuArchCheck) {
+        RapidsPluginUtils.validateGpuArchitecture()
+      }
+
       // Fail if there are multiple plugin jars in the classpath.
       RapidsPluginUtils.detectMultipleJars(conf)
 
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index e466af5edd4..2971d9dae51 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -2141,6 +2141,13 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression.
     .booleanConf
     .createOptional
 
+  val SKIP_GPU_ARCH_CHECK = conf("spark.rapids.skipGpuArchitectureCheck")
+    .doc("When true, skips GPU architecture compatibility check. Note that this check " +
+      "might still be present in cuDF.")
+    .internal()
+    .booleanConf
+    .createWithDefault(false) // the architecture check runs unless this is set to true
+
   private def printSectionHeader(category: String): Unit =
     println(s"\n### $category")
 
@@ -2906,6 +2913,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val splitUntilSizeOverride: Option[Long] = get(SPLIT_UNTIL_SIZE_OVERRIDE)
 
+  lazy val skipGpuArchCheck: Boolean = get(SKIP_GPU_ARCH_CHECK)
+
   private val optimizerDefaults = Map(
     // this is not accurate because CPU projections do have a cost due to appending values
     // to each row that is produced, but this needs to be a really small number because
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuArchitectureTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuArchitectureTestSuite.scala
new file mode 100644
index 00000000000..876687d219b
--- /dev/null
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuArchitectureTestSuite.scala
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import com.nvidia.spark.rapids.RapidsPluginUtils.validateGpuArchitectureInternal
+import org.scalatest.funsuite.AnyFunSuite
+
+/**
+ * Exercises `RapidsPluginUtils.validateGpuArchitectureInternal` with fixed architecture sets.
+ */
+class GpuArchitectureTestSuite extends AnyFunSuite {
+  // Library architecture sets reused by several tests; they differ only in whether the
+  // intermediate architecture 65 is listed.
+  private val archsWithout65 = Set(50, 60, 70)
+  private val archsWith65 = Set(50, 60, 65, 70)
+
+  // Device architecture listed by both libraries: no exception expected
+  test("test supported architecture") {
+    validateGpuArchitectureInternal(60, archsWithout65, archsWith65)
+  }
+
+  // Device architecture below the smallest common architecture: must be rejected
+  test("test unsupported architecture") {
+    val deviceArch = 40
+    val thrown = intercept[RuntimeException] {
+      validateGpuArchitectureInternal(deviceArch, archsWithout65, archsWith65)
+    }
+    assert(thrown.getMessage.contains(s"Device architecture $deviceArch is unsupported"))
+  }
+
+  // Minor version above every listed entry of the same major: no exception expected
+  test("test supported major architecture with higher minor version") {
+    validateGpuArchitectureInternal(67, archsWith65, archsWith65)
+  }
+
+  // Minor version between two listed entries of the same major: no exception expected
+  test("test supported major architecture with lower minor version") {
+    validateGpuArchitectureInternal(63, archsWith65, archsWith65)
+  }
+
+  // Libraries with no architecture in common: the check itself fails
+  test("test empty supported architecture set") {
+    val deviceArch = 60
+    val thrown = intercept[IllegalStateException] {
+      validateGpuArchitectureInternal(deviceArch, Set(50, 60), Set(70, 80))
+    }
+    assert(thrown.getMessage.contains(
+      s"Compatibility check failed for GPU architecture $deviceArch"))
+  }
+}