Skip to content

Commit

Permalink
address PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ChuckHastings committed Nov 2, 2023
1 parent 55fe02a commit 6178f36
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 400 deletions.
4 changes: 1 addition & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,7 @@ endif()

set(CUGRAPH_SOURCES
src/detail/shuffle_vertices.cu
src/detail/shuffle_vertex_pairs_int32_int32.cu
src/detail/shuffle_vertex_pairs_int32_int64.cu
src/detail/shuffle_vertex_pairs_int64_int64.cu
src/detail/shuffle_vertex_pairs.cu
src/detail/collect_local_vertex_values.cu
src/detail/groupby_and_count.cu
src/sampling/random_walks_mg.cu
Expand Down
15 changes: 14 additions & 1 deletion cpp/include/cugraph/mtmg/instance_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <cugraph/mtmg/handle.hpp>

#include <nccl.h>
#include <raft/comms/std_comms.hpp>

#include <vector>

Expand All @@ -45,6 +45,19 @@ class instance_manager_t {
{
}

/**
 * @brief Destroy the instance manager, releasing the per-GPU NCCL communicators.
 *
 * Switches to each communicator's device (nccl_comms_[i] is destroyed with
 * device_ids_[i] current — assumes the two vectors are parallel and
 * device_ids_ is at least as long as nccl_comms_; TODO confirm invariant is
 * enforced at construction), then restores whichever device was current on
 * entry.
 *
 * All errors are swallowed: destructors are implicitly noexcept, so a
 * RAFT_CUDA_TRY / RAFT_NCCL_TRY exception escaping this body would call
 * std::terminate. Teardown here is strictly best-effort.
 */
~instance_manager_t()
{
  int current_device{};

  try {
    RAFT_CUDA_TRY(cudaGetDevice(&current_device));

    for (size_t i = 0; i < nccl_comms_.size(); ++i) {
      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[i].value()));
      RAFT_NCCL_TRY(ncclCommDestroy(*nccl_comms_[i]));
    }

    RAFT_CUDA_TRY(cudaSetDevice(current_device));
  } catch (...) {
    // Suppress: cleanup failure during shutdown must not terminate the process.
  }
}

/**
* @brief Get handle
*
Expand Down
74 changes: 37 additions & 37 deletions cpp/include/cugraph/mtmg/resource_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,19 @@ namespace mtmg {
* register_local_gpu (or register_remote_gpu once we support a multi-node
* configuration) to allocate resources that can be used in the mtmg space.
*
* Each GPU in the cluster should be given a unique global rank, an integer
* that will be used to reference the GPU within the resource manager. It
* is recommended that the GPUs be numbered sequentially from 0, although this
* is not required.
*
* When we want to execute some graph computations, we need to create an instance for execution.
* Based on how big a subset of the desired compute resources is desired, we can allocate some
* number of GPUs to the problem (up to the total set of managed resources).
*
* The returned instance can be used to create a graph, execute one or more algorithms, etc. Once
* we are done the caller can delete the instance.
*
* The caller is assumed to be responsible for scheduling use of the resources.
*
* For our first release, we will only consider a single node multi-GPU configuration, so the remote
* GPU methods are currently disabled via ifdef.
Expand All @@ -63,27 +68,28 @@ class resource_manager_t {
/**
* @brief add a local GPU to the resource manager.
*
* @param rank The rank to assign to the local GPU
* @param device_id The device_id corresponding to this rank
* @param global_rank The global rank to assign to the local GPU
* @param local_device_id The local device_id corresponding to this rank
*/
void register_local_gpu(int rank, rmm::cuda_device_id device_id)
void register_local_gpu(int global_rank, rmm::cuda_device_id local_device_id)
{
std::lock_guard<std::mutex> lock(lock_);

CUGRAPH_EXPECTS(remote_rank_set_.find(rank) == remote_rank_set_.end(),
"cannot register same rank as local and remote");
CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(),
"cannot register same rank multiple times");
CUGRAPH_EXPECTS(remote_rank_set_.find(global_rank) == remote_rank_set_.end(),
"cannot register same global_rank as local and remote");
CUGRAPH_EXPECTS(local_rank_map_.find(global_rank) == local_rank_map_.end(),
"cannot register same global_rank multiple times");

int num_gpus_this_node;
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_this_node));

CUGRAPH_EXPECTS((device_id.value() >= 0) && (device_id.value() < num_gpus_this_node),
"device id out of range");
CUGRAPH_EXPECTS(
(local_device_id.value() >= 0) && (local_device_id.value() < num_gpus_this_node),
"local device id out of range");

local_rank_map_.insert(std::pair(rank, device_id));
local_rank_map_.insert(std::pair(global_rank, local_device_id));

RAFT_CUDA_TRY(cudaSetDevice(device_id.value()));
RAFT_CUDA_TRY(cudaSetDevice(local_device_id.value()));

// FIXME: There is a bug in the cuda_memory_resource that results in a Hang.
// using the pool resource as a work-around.
Expand All @@ -98,36 +104,36 @@ class resource_manager_t {
// (or the constructor of the object) to configure this behavior
#if 0
auto per_device_it = per_device_rmm_resources_.insert(
std::pair{rank, std::make_shared<rmm::mr::cuda_memory_resource>()});
std::pair{global_rank, std::make_shared<rmm::mr::cuda_memory_resource>()});
#else
auto const [free, total] = rmm::detail::available_device_memory();
auto const min_alloc =
rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT);

auto per_device_it = per_device_rmm_resources_.insert(
std::pair{rank,
std::pair{global_rank,
rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), min_alloc)});
#endif

rmm::mr::set_per_device_resource(device_id, per_device_it.first->second.get());
rmm::mr::set_per_device_resource(local_device_id, per_device_it.first->second.get());
}

/**
* @brief add a remote GPU to the resource manager.
*
* @param rank The rank to assign to the remote GPU
* @param global_rank The global rank to assign to the remote GPU
*/
void register_remote_gpu(int rank)
void register_remote_gpu(int global_rank)
{
std::lock_guard<std::mutex> lock(lock_);

CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(),
"cannot register same rank as local and remote");
CUGRAPH_EXPECTS(remote_rank_set_.find(rank) == remote_rank_set_.end(),
"cannot register same rank multiple times");
CUGRAPH_EXPECTS(local_rank_map_.find(global_rank) == local_rank_map_.end(),
"cannot register same global_rank as local and remote");
CUGRAPH_EXPECTS(remote_rank_set_.find(global_rank) == remote_rank_set_.end(),
"cannot register same global_rank multiple times");

remote_rank_set_.insert(rank);
remote_rank_set_.insert(global_rank);
}

/**
Expand All @@ -154,18 +160,12 @@ class resource_manager_t {
{
std::vector<int> local_ranks_to_include;

std::for_each(ranks_to_include.begin(),
ranks_to_include.end(),
[&local_ranks = local_rank_map_,
&remote_ranks = remote_rank_set_,
&local_ranks_to_include](int rank) {
if (local_ranks.find(rank) == local_ranks.end()) {
CUGRAPH_EXPECTS(remote_ranks.find(rank) != remote_ranks.end(),
"requesting inclusion of an invalid rank");
} else {
local_ranks_to_include.push_back(rank);
}
});
std::copy_if(ranks_to_include.begin(),
ranks_to_include.end(),
std::back_inserter(local_ranks_to_include),
[&local_ranks = local_rank_map_](int rank) {
return (local_ranks.find(rank) != local_ranks.end());
});

std::vector<std::unique_ptr<ncclComm_t>> nccl_comms{};
std::vector<std::unique_ptr<raft::handle_t>> handles{};
Expand All @@ -182,7 +182,7 @@ class resource_manager_t {

int current_device{};
RAFT_CUDA_TRY(cudaGetDevice(&current_device));
NCCL_TRY(ncclGroupStart());
RAFT_NCCL_TRY(ncclGroupStart());

for (size_t i = 0; i < local_ranks_to_include.size(); ++i) {
int rank = local_ranks_to_include[i];
Expand All @@ -196,12 +196,12 @@ class resource_manager_t {
per_device_rmm_resources_.find(rank)->second));
device_ids.push_back(pos->second);

NCCL_TRY(
RAFT_NCCL_TRY(
ncclCommInitRank(nccl_comms[i].get(), ranks_to_include.size(), instance_manager_id, rank));
raft::comms::build_comms_nccl_only(
handles[i].get(), *nccl_comms[i], ranks_to_include.size(), rank);
}
NCCL_TRY(ncclGroupEnd());
RAFT_NCCL_TRY(ncclGroupEnd());
RAFT_CUDA_TRY(cudaSetDevice(current_device));

std::vector<std::thread> running_threads;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <detail/graph_partition_utils.cuh>

#include <cugraph/detail/shuffle_wrappers.hpp>
Expand Down Expand Up @@ -358,5 +356,167 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
minor_comm_size});
}

// Explicit template instantiations for the shuffle entry points defined above,
// consolidated into this single translation unit (replacing the previous
// per-type-combination .cu files). One instantiation per supported
// (vertex id, weight, edge id) type combination; the edge-type column is
// always int32_t.
//
// "ext" variants — presumably externally-numbered (un-renumbered) vertex ids;
// confirm against cugraph/detail/shuffle_wrappers.hpp.

// vertex: int32, weight: float, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: double, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int64, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int64, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// "int" variants — presumably internally-renumbered vertex ids (confirm
// against cugraph/detail/shuffle_wrappers.hpp); these additionally take the
// vertex partition range boundaries.

// vertex: int32, weight: float, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: double, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int64, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int64_t> const& vertex_partition_range_lasts);

// vertex: int64, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int64_t> const& vertex_partition_range_lasts);

} // namespace detail
} // namespace cugraph
Loading

0 comments on commit 6178f36

Please sign in to comment.