Skip to content

Commit

Permalink
address PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ChuckHastings committed Nov 2, 2023
1 parent 55fe02a commit 6178f36
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 400 deletions.
4 changes: 1 addition & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,7 @@ endif()

set(CUGRAPH_SOURCES
src/detail/shuffle_vertices.cu
src/detail/shuffle_vertex_pairs_int32_int32.cu
src/detail/shuffle_vertex_pairs_int32_int64.cu
src/detail/shuffle_vertex_pairs_int64_int64.cu
src/detail/shuffle_vertex_pairs.cu
src/detail/collect_local_vertex_values.cu
src/detail/groupby_and_count.cu
src/sampling/random_walks_mg.cu
Expand Down
15 changes: 14 additions & 1 deletion cpp/include/cugraph/mtmg/instance_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <cugraph/mtmg/handle.hpp>

#include <nccl.h>
#include <raft/comms/std_comms.hpp>

#include <vector>

Expand All @@ -45,6 +45,19 @@ class instance_manager_t {
{
}

/**
 * @brief Destroy the instance manager, releasing the per-GPU NCCL communicators.
 *
 * Switches to each communicator's device (nccl_comms_[i] is destroyed with
 * device_ids_[i] current — assumes the two vectors are parallel and
 * device_ids_ is at least as long as nccl_comms_; TODO confirm invariant is
 * enforced at construction), then restores whichever device was current on
 * entry.
 *
 * All errors are swallowed: destructors are implicitly noexcept, so a
 * RAFT_CUDA_TRY / RAFT_NCCL_TRY exception escaping this body would call
 * std::terminate. Teardown here is strictly best-effort.
 */
~instance_manager_t()
{
  int current_device{};

  try {
    RAFT_CUDA_TRY(cudaGetDevice(&current_device));

    for (size_t i = 0; i < nccl_comms_.size(); ++i) {
      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[i].value()));
      RAFT_NCCL_TRY(ncclCommDestroy(*nccl_comms_[i]));
    }

    RAFT_CUDA_TRY(cudaSetDevice(current_device));
  } catch (...) {
    // Suppress: cleanup failure during shutdown must not terminate the process.
  }
}

/**
* @brief Get handle
*
Expand Down
74 changes: 37 additions & 37 deletions cpp/include/cugraph/mtmg/resource_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,19 @@ namespace mtmg {
* register_local_gpu (or register_remote_gpu once we support a multi-node
* configuration) to allocate resources that can be used in the mtmg space.
*
* Each GPU in the cluster should be given a unique global rank, an integer
* that will be used to reference the GPU within the resource manager. It
* is recommended that the GPUs be numbered sequentially from 0, although this
* is not required.
*
* When we want to execute some graph computations, we need to create an instance for execution.
* Based on how big a subset of the desired compute resources is desired, we can allocate some
* number of GPUs to the problem (up to the total set of managed resources).
*
* The returned instance can be used to create a graph, execute one or more algorithms, etc. Once
* we are done the caller can delete the instance.
*
* The caller is assumed to be responsible for scheduling use of the resources.
*
* For our first release, we will only consider a single node multi-GPU configuration, so the remote
* GPU methods are currently disabled via ifdef.
Expand All @@ -63,27 +68,28 @@ class resource_manager_t {
/**
* @brief add a local GPU to the resource manager.
*
* @param rank The rank to assign to the local GPU
* @param device_id The device_id corresponding to this rank
* @param global_rank The global rank to assign to the local GPU
* @param local_device_id The local device_id corresponding to this rank
*/
void register_local_gpu(int rank, rmm::cuda_device_id device_id)
void register_local_gpu(int global_rank, rmm::cuda_device_id local_device_id)
{
std::lock_guard<std::mutex> lock(lock_);

CUGRAPH_EXPECTS(remote_rank_set_.find(rank) == remote_rank_set_.end(),
"cannot register same rank as local and remote");
CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(),
"cannot register same rank multiple times");
CUGRAPH_EXPECTS(remote_rank_set_.find(global_rank) == remote_rank_set_.end(),
"cannot register same global_rank as local and remote");
CUGRAPH_EXPECTS(local_rank_map_.find(global_rank) == local_rank_map_.end(),
"cannot register same global_rank multiple times");

int num_gpus_this_node;
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_this_node));

CUGRAPH_EXPECTS((device_id.value() >= 0) && (device_id.value() < num_gpus_this_node),
"device id out of range");
CUGRAPH_EXPECTS(
(local_device_id.value() >= 0) && (local_device_id.value() < num_gpus_this_node),
"local device id out of range");

local_rank_map_.insert(std::pair(rank, device_id));
local_rank_map_.insert(std::pair(global_rank, local_device_id));

RAFT_CUDA_TRY(cudaSetDevice(device_id.value()));
RAFT_CUDA_TRY(cudaSetDevice(local_device_id.value()));

// FIXME: There is a bug in the cuda_memory_resource that results in a Hang.
// using the pool resource as a work-around.
Expand All @@ -98,36 +104,36 @@ class resource_manager_t {
// (or the constructor of the object) to configure this behavior
#if 0
auto per_device_it = per_device_rmm_resources_.insert(
std::pair{rank, std::make_shared<rmm::mr::cuda_memory_resource>()});
std::pair{global_rank, std::make_shared<rmm::mr::cuda_memory_resource>()});
#else
auto const [free, total] = rmm::detail::available_device_memory();
auto const min_alloc =
rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT);

auto per_device_it = per_device_rmm_resources_.insert(
std::pair{rank,
std::pair{global_rank,
rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), min_alloc)});
#endif

rmm::mr::set_per_device_resource(device_id, per_device_it.first->second.get());
rmm::mr::set_per_device_resource(local_device_id, per_device_it.first->second.get());
}

/**
* @brief add a remote GPU to the resource manager.
*
* @param rank The rank to assign to the remote GPU
* @param global_rank The global rank to assign to the remote GPU
*/
void register_remote_gpu(int rank)
void register_remote_gpu(int global_rank)
{
std::lock_guard<std::mutex> lock(lock_);

CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(),
"cannot register same rank as local and remote");
CUGRAPH_EXPECTS(remote_rank_set_.find(rank) == remote_rank_set_.end(),
"cannot register same rank multiple times");
CUGRAPH_EXPECTS(local_rank_map_.find(global_rank) == local_rank_map_.end(),
"cannot register same global_rank as local and remote");
CUGRAPH_EXPECTS(remote_rank_set_.find(global_rank) == remote_rank_set_.end(),
"cannot register same global_rank multiple times");

remote_rank_set_.insert(rank);
remote_rank_set_.insert(global_rank);
}

/**
Expand All @@ -154,18 +160,12 @@ class resource_manager_t {
{
std::vector<int> local_ranks_to_include;

std::for_each(ranks_to_include.begin(),
ranks_to_include.end(),
[&local_ranks = local_rank_map_,
&remote_ranks = remote_rank_set_,
&local_ranks_to_include](int rank) {
if (local_ranks.find(rank) == local_ranks.end()) {
CUGRAPH_EXPECTS(remote_ranks.find(rank) != remote_ranks.end(),
"requesting inclusion of an invalid rank");
} else {
local_ranks_to_include.push_back(rank);
}
});
std::copy_if(ranks_to_include.begin(),
ranks_to_include.end(),
std::back_inserter(local_ranks_to_include),
[&local_ranks = local_rank_map_](int rank) {
return (local_ranks.find(rank) != local_ranks.end());
});

std::vector<std::unique_ptr<ncclComm_t>> nccl_comms{};
std::vector<std::unique_ptr<raft::handle_t>> handles{};
Expand All @@ -182,7 +182,7 @@ class resource_manager_t {

int current_device{};
RAFT_CUDA_TRY(cudaGetDevice(&current_device));
NCCL_TRY(ncclGroupStart());
RAFT_NCCL_TRY(ncclGroupStart());

for (size_t i = 0; i < local_ranks_to_include.size(); ++i) {
int rank = local_ranks_to_include[i];
Expand All @@ -196,12 +196,12 @@ class resource_manager_t {
per_device_rmm_resources_.find(rank)->second));
device_ids.push_back(pos->second);

NCCL_TRY(
RAFT_NCCL_TRY(
ncclCommInitRank(nccl_comms[i].get(), ranks_to_include.size(), instance_manager_id, rank));
raft::comms::build_comms_nccl_only(
handles[i].get(), *nccl_comms[i], ranks_to_include.size(), rank);
}
NCCL_TRY(ncclGroupEnd());
RAFT_NCCL_TRY(ncclGroupEnd());
RAFT_CUDA_TRY(cudaSetDevice(current_device));

std::vector<std::thread> running_threads;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <detail/graph_partition_utils.cuh>

#include <cugraph/detail/shuffle_wrappers.hpp>
Expand Down Expand Up @@ -358,5 +356,167 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
minor_comm_size});
}

// Explicit template instantiations for the shuffle entry points defined above,
// consolidated into this single translation unit (replacing the previous
// per-type-combination .cu files). One instantiation per supported
// (vertex id, weight, edge id) type combination; the edge-type column is
// always int32_t.
//
// "ext" variants — presumably externally-numbered (un-renumbered) vertex ids;
// confirm against cugraph/detail/shuffle_wrappers.hpp.

// vertex: int32, weight: float, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: double, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int32, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int64, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// vertex: int64, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types);

// "int" variants — presumably internally-renumbered vertex ids (confirm
// against cugraph/detail/shuffle_wrappers.hpp); these additionally take the
// vertex partition range boundaries.

// vertex: int32, weight: float, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: double, edge id: int32
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int32_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int32, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int32_t>,
                    rmm::device_uvector<int32_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int32_t>&& majors,
  rmm::device_uvector<int32_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int32_t> const& vertex_partition_range_lasts);

// vertex: int64, weight: float, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<float>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<float>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int64_t> const& vertex_partition_range_lasts);

// vertex: int64, weight: double, edge id: int64
template std::tuple<rmm::device_uvector<int64_t>,
                    rmm::device_uvector<int64_t>,
                    std::optional<rmm::device_uvector<double>>,
                    std::optional<rmm::device_uvector<int64_t>>,
                    std::optional<rmm::device_uvector<int32_t>>>
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  raft::handle_t const& handle,
  rmm::device_uvector<int64_t>&& majors,
  rmm::device_uvector<int64_t>&& minors,
  std::optional<rmm::device_uvector<double>>&& weights,
  std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
  std::optional<rmm::device_uvector<int32_t>>&& edge_types,
  std::vector<int64_t> const& vertex_partition_range_lasts);

} // namespace detail
} // namespace cugraph
Loading

0 comments on commit 6178f36

Please sign in to comment.