Skip to content

Commit

Permalink
Fix bug with NCCL resource reclamation when using multiple grid desc…
Browse files Browse the repository at this point in the history
…riptors. Add NCCL and NVSHMEM resource reclamation to cudecompGridDescDestroy. (#4)
  • Loading branch information
romerojosh authored Aug 16, 2022
1 parent f9c5e56 commit 237a40b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 6 deletions.
2 changes: 2 additions & 0 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct cudecompHandle {
int32_t local_rank; // MPI rank
int32_t local_nranks; // MPI size

// Entries for NCCL management
int n_grid_descs_using_nccl = 0; // Count of grid descriptors using NCCL
ncclComm_t nccl_comm = nullptr; // NCCL communicator (global)
ncclComm_t nccl_local_comm = nullptr; // NCCL communicator (intranode)

Expand Down
38 changes: 32 additions & 6 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -396,12 +396,17 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
}
}
#endif
if (!transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) &&
!haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
handle->nccl_comm = nullptr;
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
handle->nccl_local_comm = nullptr;
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
handle->n_grid_descs_using_nccl++;
} else {
// Destroy NCCL communicator to reclaim resources if not used
if (handle->nccl_comm && handle->nccl_local_comm && handle->n_grid_descs_using_nccl == 0) {
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
handle->nccl_comm = nullptr;
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
handle->nccl_local_comm = nullptr;
}
}

*grid_desc_in = grid_desc;
Expand Down Expand Up @@ -437,6 +442,19 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
if (e) { CHECK_CUDA(cudaEventDestroy(e)); }
}

if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
handle->n_grid_descs_using_nccl--;

// Destroy NCCL communicator to reclaim resources if not used
if (handle->nccl_comm && handle->nccl_local_comm && handle->n_grid_descs_using_nccl == 0) {
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
handle->nccl_comm = nullptr;
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
handle->nccl_local_comm = nullptr;
}
}

#ifdef ENABLE_NVSHMEM
if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
Expand All @@ -447,6 +465,14 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
}
handle->n_grid_descs_using_nvshmem--;

// Finalize nvshmem to reclaim symmetric heap memory if not used
if (handle->nvshmem_initialized && handle->n_grid_descs_using_nvshmem == 0) {
nvshmem_finalize();
handle->nvshmem_initialized = false;
handle->nvshmem_allocations.clear();
handle->nvshmem_allocation_size = 0;
}
}
#endif

Expand Down

0 comments on commit 237a40b

Please sign in to comment.