From 480798652b7655d29ae2cad306ac9b8eef2beeea Mon Sep 17 00:00:00 2001
From: Chang Liu
Date: Fri, 22 Nov 2024 08:39:51 -0800
Subject: [PATCH] [Bugfix] Dynamic load NVML symbols for better compatibility (#72)

[File PR here for the record]

Dynamically load NVML symbols for querying GPU fabric info to address
incompatibility issues with outdated display drivers.

Authors:
  - Chang Liu (https://github.com/chang-l)

Approvers:
  - https://github.com/linhu-nv

URL: https://github.com/rapidsai/cugraph-gnn/pull/72
---
 cpp/src/nvml_wrap.cpp                | 77 ++++++++++++++++++++++++++++
 cpp/src/nvml_wrap.h                  | 27 ++++++++++
 cpp/src/wholememory/communicator.cpp | 74 ++++++++++++++------------
 cpp/src/wholememory/system_info.cpp  | 26 +++++-----
 cpp/src/wholememory/system_info.hpp  |  5 +-
 5 files changed, 163 insertions(+), 46 deletions(-)
 create mode 100644 cpp/src/nvml_wrap.cpp
 create mode 100644 cpp/src/nvml_wrap.h

diff --git a/cpp/src/nvml_wrap.cpp b/cpp/src/nvml_wrap.cpp
new file mode 100644
index 0000000..8aa3d8d
--- /dev/null
+++ b/cpp/src/nvml_wrap.cpp
@@ -0,0 +1,77 @@
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "nvml_wrap.h"
+
+#if CUDA_VERSION >= 12030
+#include <dlfcn.h>
+#include <cstdio>
+#include <mutex>
+
+namespace {
+
+void* nvml_handle = nullptr;
+std::mutex nvml_mutex;
+bool nvml_loaded = false;
+
+bool LoadNvmlLibrary()
+{
+  nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+  if (!nvml_handle) {
+    nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
+    if (!nvml_handle) {
+      fprintf(stderr, "Failed to load NVML library: %s\n", dlerror());
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+T LoadNvmlSymbol(const char* name)
+{
+  void* symbol = dlsym(nvml_handle, name);
+  if (!symbol) { return nullptr; }
+  return reinterpret_cast<T>(symbol);
+}
+
+}  // namespace
+
+// Global function pointers
+nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr;
+nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr;
+
+// Ensure NVML is loaded and symbols are initialized
+bool NvmlFabricSymbolLoaded()
+{
+  std::lock_guard<std::mutex> lock(nvml_mutex);
+  if (nvml_loaded) {
+    return true;  // Already loaded
+  }
+
+  if (LoadNvmlLibrary()) {
+    nvmlDeviceGetHandleByIndexPtr =
+      LoadNvmlSymbol<nvmlDeviceGetHandleByIndexFunc>("nvmlDeviceGetHandleByIndex");
+    nvmlDeviceGetGpuFabricInfoPtr =
+      LoadNvmlSymbol<nvmlDeviceGetGpuFabricInfoFunc>("nvmlDeviceGetGpuFabricInfo");
+
+    if (!nvmlDeviceGetHandleByIndexPtr || !nvmlDeviceGetGpuFabricInfoPtr) {
+      dlclose(nvml_handle);
+      nvml_handle = nullptr;
+    } else {
+      nvml_loaded = true;
+    }
+  }
+  return nvml_loaded;
+}
+#endif  // CUDA_VERSION >= 12030
diff --git a/cpp/src/nvml_wrap.h b/cpp/src/nvml_wrap.h
new file mode 100644
index 0000000..8a6764b
--- /dev/null
+++ b/cpp/src/nvml_wrap.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <cuda.h>
+
+#if CUDA_VERSION >= 12030
+#include <nvml.h>
+
+bool NvmlFabricSymbolLoaded();
+
+typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*);
+typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
+
+extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr;
+extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr;
+#endif  // CUDA_VERSION >= 12030
diff --git a/cpp/src/wholememory/communicator.cpp b/cpp/src/wholememory/communicator.cpp
index f76a4c7..34053ad 100644
--- a/cpp/src/wholememory/communicator.cpp
+++ b/cpp/src/wholememory/communicator.cpp
@@ -497,6 +497,7 @@ void get_host_info(host_info* phi)
 bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr<rank_info[]>& p_rank_info)
 {
 #if CUDA_VERSION >= 12030
+  if (!nvmlFabricSymbolLoaded) return 0;
   int flag = 0;
   CUdevice currentDev;
   WM_CU_CHECK_NO_THROW(cuDeviceGet(&currentDev, wm_comm->dev_id));
@@ -534,16 +535,22 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
   wm_comm->clique_info.is_in_clique = 0;
 
 #if CUDA_VERSION >= 12030
-  memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
-  WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
-                            WHOLEMEMORY_SUCCESS);
+  if (nvmlFabricSymbolLoaded) {
+    memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
+    WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
+                              WHOLEMEMORY_SUCCESS);
 
-  // // A zero UUID means we don't have MNNVL fabric info
-  if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
-    wm_comm->clique_info.is_in_clique = 0;
+    // // A zero UUID means we don't have MNNVL fabric info
+    if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
+      wm_comm->clique_info.is_in_clique = 0;
+    } else {
+      wm_comm->clique_info.is_in_clique = 1;
+    }
   } else {
-    wm_comm->clique_info.is_in_clique = 1;
+    WHOLEMEMORY_WARN(
+      "Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL "
+      "support will be disabled.");
   }
 #endif
@@ -573,38 +580,41 @@
     }
 
 #if CUDA_VERSION >= 12030
-
-    if ((memcmp(ri.fabric_info.clusterUuid,
-                p_rank_info.get()[r].fabric_info.clusterUuid,
-                NVML_GPU_FABRIC_UUID_LEN) == 0) &&
-        (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
-      if (r == wm_comm->world_rank) {
-        wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
+    if (nvmlFabricSymbolLoaded) {
+      if ((memcmp(ri.fabric_info.clusterUuid,
+                  p_rank_info.get()[r].fabric_info.clusterUuid,
+                  NVML_GPU_FABRIC_UUID_LEN) == 0) &&
+          (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
+        if (r == wm_comm->world_rank) {
+          wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
+        }
+        if (wm_comm->clique_info.clique_rank_num == 0) {
+          wm_comm->clique_info.clique_first_rank = r;
+        }
+        wm_comm->clique_info.clique_rank_num++;
       }
-      if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; }
-      wm_comm->clique_info.clique_rank_num++;
+      clique_uuids.insert(
+        std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
+                    NVML_GPU_FABRIC_UUID_LEN));
     }
-    clique_uuids.insert(
-      std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
-                  NVML_GPU_FABRIC_UUID_LEN));
-
 #endif
   }
 
 #if CUDA_VERSION >= 12030
-  wm_comm->clique_info.clique_num = clique_uuids.size();
-
-  std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
-                                 NVML_GPU_FABRIC_UUID_LEN);
-  int id = 0;
-  for (auto clique_uuid : clique_uuids) {
-    if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
-    id++;
-  }
-
-  wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
-                           (wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
+  if (nvmlFabricSymbolLoaded) {
+    wm_comm->clique_info.clique_num = clique_uuids.size();
+
+    std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
+                                   NVML_GPU_FABRIC_UUID_LEN);
+    int id = 0;
+    for (auto clique_uuid : clique_uuids) {
+      if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
+      id++;
+    }
+    wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
+                             (wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
+  }
 #endif
 }
diff --git a/cpp/src/wholememory/system_info.cpp b/cpp/src/wholememory/system_info.cpp
index 01c124a..8cd0209 100644
--- a/cpp/src/wholememory/system_info.cpp
+++ b/cpp/src/wholememory/system_info.cpp
@@ -13,8 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
 */
-#include "system_info.hpp"
-
 #include
 
 #include "cuda_macros.hpp"
@@ -140,17 +138,19 @@ wholememory_error_code_t NvmlEnsureInitialized()
 wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo)
 {
   WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS);
-  std::lock_guard<std::mutex> locked(lock);
-  // gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
-  nvmlDevice_t nvml_device;
-  nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device);
-  WHOLEMEMORY_EXPECTS_NOTHROW(
-    ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
-  ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo);
-  WHOLEMEMORY_EXPECTS_NOTHROW(
-    ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));
-
-  return WHOLEMEMORY_SUCCESS;
+  if (wholememory::nvmlFabricSymbolLoaded) {
+    std::lock_guard<std::mutex> locked(lock);
+    // gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
+    nvmlDevice_t nvml_device;
+    nvmlReturn_t ret = nvmlDeviceGetHandleByIndexPtr(dev, &nvml_device);
+    WHOLEMEMORY_EXPECTS_NOTHROW(
+      ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
+    ret = nvmlDeviceGetGpuFabricInfoPtr(nvml_device, gpuFabricInfo);
+    WHOLEMEMORY_EXPECTS_NOTHROW(
+      ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));
+    return WHOLEMEMORY_SUCCESS;
+  }
+  return WHOLEMEMORY_SYSTEM_ERROR;
 }
 
 };  // namespace wholememory
diff --git a/cpp/src/wholememory/system_info.hpp b/cpp/src/wholememory/system_info.hpp
index a157924..4d6c52c 100644
--- a/cpp/src/wholememory/system_info.hpp
+++ b/cpp/src/wholememory/system_info.hpp
@@ -18,6 +18,7 @@
 #include "wholememory/wholememory.h"
 
 #if CUDA_VERSION >= 12030
+#include "nvml_wrap.h"
 #include <nvml.h>
 #endif
 bool DevAttrPagebleMemoryAccess();
@@ -37,7 +38,9 @@ bool SupportEGM();
 // bool SupportMNNVLForEGM();
 #if CUDA_VERSION >= 12030
 namespace wholememory {
+
+inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded();
 wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo);
-}
+}  // namespace wholememory
 #endif
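
Usage sketch (not part of the patch): regular NVML entry points such as nvmlInit_v2() stay
linked the usual way; only the fabric-info symbols are resolved at runtime via dlopen/dlsym,
because older display drivers ship a libnvidia-ml.so.1 that does not export them. Inside
WholeMemory the check is cached once through the inline variable
wholememory::nvmlFabricSymbolLoaded declared in system_info.hpp. The standalone main(), the
use of device index 0, and the printf reporting below are illustrative assumptions, not code
from this PR; the sketch also assumes nvml_wrap.cpp is compiled into the same binary.

  // sketch.cpp -- illustrative only; assumes CUDA >= 12.3 headers and a normal link against
  // NVML, with the fabric-info entry points resolved through nvml_wrap.h at runtime.
  #include <cstdio>
  #include <cuda.h>
  #if CUDA_VERSION >= 12030
  #include <nvml.h>
  #include "nvml_wrap.h"
  #endif

  int main()
  {
  #if CUDA_VERSION >= 12030
    if (!NvmlFabricSymbolLoaded()) {
      // Outdated driver: the fabric symbols are missing, so callers fall back (MNNVL disabled).
      std::printf("NVML fabric symbols unavailable; skipping fabric query.\n");
      return 0;
    }
    if (nvmlInit_v2() != NVML_SUCCESS) { return 1; }  // ordinary NVML call, linked normally
    nvmlDevice_t dev{};
    nvmlGpuFabricInfo_t info{};
    // Device index 0 is an arbitrary choice for illustration.
    if (nvmlDeviceGetHandleByIndexPtr(0, &dev) == NVML_SUCCESS &&
        nvmlDeviceGetGpuFabricInfoPtr(dev, &info) == NVML_SUCCESS) {
      std::printf("GPU 0 fabric cliqueId: %u\n", info.cliqueId);
    }
    nvmlShutdown();
  #else
    std::printf("Built with CUDA < 12.3; the fabric-info API is not available.\n");
  #endif
    return 0;
  }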