Skip to content

Commit

Permalink
[Bugfix] Dynamic load NVML symbols for better compatibility (#72)
Browse files Browse the repository at this point in the history
[File PR here for the record]
Dynamically load NVML symbols for querying GPU fabric info to address incompatibility issues with outdated display drivers.

Authors:
  - Chang Liu (https://github.com/chang-l)

Approvers:
  - https://github.com/linhu-nv

URL: #72
  • Loading branch information
chang-l authored Nov 22, 2024
1 parent 2776772 commit 4807986
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 46 deletions.
77 changes: 77 additions & 0 deletions cpp/src/nvml_wrap.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nvml_wrap.h"

#if CUDA_VERSION >= 12030
#include <dlfcn.h>
#include <mutex>
#include <stdio.h>

namespace {

// Handle returned by dlopen() for the NVML shared library; nullptr while no library is open.
void* nvml_handle = nullptr;
// Guards the load-and-resolve sequence performed by NvmlFabricSymbolLoaded().
std::mutex nvml_mutex;
// Set once the library is open and every required symbol has been resolved.
bool nvml_loaded = false;

// Opens the NVML shared library, trying the versioned soname first and the
// unversioned name second. Stores the result in nvml_handle.
// Returns true when a handle was obtained; otherwise reports dlerror() on stderr
// and returns false.
bool LoadNvmlLibrary()
{
  static const char* const kLibraryNames[] = {"libnvidia-ml.so.1", "libnvidia-ml.so"};

  for (const char* library_name : kLibraryNames) {
    nvml_handle = dlopen(library_name, RTLD_NOW);
    if (nvml_handle != nullptr) { return true; }
  }

  // Both attempts failed; dlerror() describes the most recent one.
  fprintf(stderr, "Failed to load NVML library: %s\n", dlerror());
  return false;
}

// Looks up `name` through nvml_handle and returns it converted to the pointer type T,
// or nullptr when dlsym() does not find the symbol.
template <typename T>
T LoadNvmlSymbol(const char* name)
{
  void* const address = dlsym(nvml_handle, name);
  return address != nullptr ? reinterpret_cast<T>(address) : nullptr;
}

}  // namespace

// Global function pointers to the dynamically resolved NVML entry points.
// They start out as nullptr and are assigned by NvmlFabricSymbolLoaded(); the matching
// `extern` declarations live in nvml_wrap.h.
nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr;
nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr;

// Ensure NVML is loaded and the fabric-related symbols are initialized.
//
// The whole sequence runs under nvml_mutex. A successful load is cached in nvml_loaded,
// so later calls return true without touching the dynamic loader again. A failure is
// not cached: the next call attempts the load again.
//
// Returns true when both nvmlDeviceGetHandleByIndexPtr and nvmlDeviceGetGpuFabricInfoPtr
// are resolved; false when the library cannot be opened or either symbol is missing.
// On failure both pointers are left as nullptr and no library handle is kept.
bool NvmlFabricSymbolLoaded()
{
  std::lock_guard<std::mutex> lock(nvml_mutex);
  if (nvml_loaded) {
    return true;  // Already loaded
  }

  if (!LoadNvmlLibrary()) { return false; }

  nvmlDeviceGetHandleByIndexPtr =
    LoadNvmlSymbol<nvmlDeviceGetHandleByIndexFunc>("nvmlDeviceGetHandleByIndex");
  nvmlDeviceGetGpuFabricInfoPtr =
    LoadNvmlSymbol<nvmlDeviceGetGpuFabricInfoFunc>("nvmlDeviceGetGpuFabricInfo");

  if (!nvmlDeviceGetHandleByIndexPtr || !nvmlDeviceGetGpuFabricInfoPtr) {
    // One symbol may have resolved while the other did not. Clear both before the
    // library is closed so no global pointer keeps an address inside a library that
    // may have been unloaded.
    nvmlDeviceGetHandleByIndexPtr = nullptr;
    nvmlDeviceGetGpuFabricInfoPtr = nullptr;
    dlclose(nvml_handle);
    nvml_handle = nullptr;
    return false;
  }

  nvml_loaded = true;
  return true;
}
#endif // CUDA_VERSION >= 12030
27 changes: 27 additions & 0 deletions cpp/src/nvml_wrap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>

#if CUDA_VERSION >= 12030
#include <nvml.h>

// Loads the NVML shared library at run time and resolves the symbols declared below.
// Returns true when both function pointers are usable, false otherwise.
bool NvmlFabricSymbolLoaded();

// Signatures of the NVML entry points that are resolved dynamically instead of being
// linked directly.
typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*);
typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*);

// Defined in nvml_wrap.cpp; nullptr until NvmlFabricSymbolLoaded() has resolved them.
extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr;
extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr;
#endif // CUDA_VERSION >= 12030
74 changes: 42 additions & 32 deletions cpp/src/wholememory/communicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ void get_host_info(host_info* phi)
bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr<rank_info[]>& p_rank_info)
{
#if CUDA_VERSION >= 12030
if (!nvmlFabricSymbolLoaded) return 0;
int flag = 0;
CUdevice currentDev;
WM_CU_CHECK_NO_THROW(cuDeviceGet(&currentDev, wm_comm->dev_id));
Expand Down Expand Up @@ -534,16 +535,22 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
wm_comm->clique_info.is_in_clique = 0;

#if CUDA_VERSION >= 12030
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);
if (nvmlFabricSymbolLoaded) {
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);

// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;
// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;

} else {
wm_comm->clique_info.is_in_clique = 1;
}
} else {
wm_comm->clique_info.is_in_clique = 1;
WHOLEMEMORY_WARN(
"Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL "
"support will be disabled.");
}

#endif
Expand Down Expand Up @@ -573,38 +580,41 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
}

#if CUDA_VERSION >= 12030

if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
if (nvmlFabricSymbolLoaded) {
if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
}
if (wm_comm->clique_info.clique_rank_num == 0) {
wm_comm->clique_info.clique_first_rank = r;
}
wm_comm->clique_info.clique_rank_num++;
}
if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; }
wm_comm->clique_info.clique_rank_num++;
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));
}
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));

#endif
}

#if CUDA_VERSION >= 12030
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
if (nvmlFabricSymbolLoaded) {
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
}
#endif
}

Expand Down
26 changes: 13 additions & 13 deletions cpp/src/wholememory/system_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "system_info.hpp"

#include <string>

#include "cuda_macros.hpp"
Expand Down Expand Up @@ -140,17 +138,19 @@ wholememory_error_code_t NvmlEnsureInitialized()
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo)
{
WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS);
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));

return WHOLEMEMORY_SUCCESS;
if (wholememory::nvmlFabricSymbolLoaded) {
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndexPtr(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfoPtr(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));
return WHOLEMEMORY_SUCCESS;
}
return WHOLEMEMORY_SYSTEM_ERROR;
}

}; // namespace wholememory
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/wholememory/system_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "wholememory/wholememory.h"

#if CUDA_VERSION >= 12030
#include "nvml_wrap.h"
#include <nvml.h>
#endif
bool DevAttrPagebleMemoryAccess();
Expand All @@ -37,7 +38,9 @@ bool SupportEGM();
// bool SupportMNNVLForEGM();
#if CUDA_VERSION >= 12030
namespace wholememory {

inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded();
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo);
}
} // namespace wholememory

#endif

0 comments on commit 4807986

Please sign in to comment.