update logging
seunghwak committed Sep 10, 2024
1 parent 3f71304 commit 3577699
Showing 5 changed files with 127 additions and 17 deletions.
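All five files get the same instrumentation pattern: synchronize the device, then print a checkpoint tagged with the calling GPU's communicator rank so interleaved multi-GPU output can be attributed. A minimal sketch of that pattern, assuming a RAFT handle with an initialized communicator (the helper name and header paths are assumptions, not identifiers from this commit):

```cpp
#include <iostream>

#include <raft/core/handle.hpp>        // assumed header path for raft::handle_t
#include <raft/util/cudart_utils.hpp>  // assumed header path for RAFT_CUDA_TRY

// Hypothetical helper illustrating the rank-tagged checkpoint repeated below.
void log_checkpoint(raft::handle_t const& handle, char const* stage)
{
  auto const comm_rank = handle.get_comms().get_rank();
  RAFT_CUDA_TRY(cudaDeviceSynchronize());  // drain in-flight GPU work before stamping
  std::cout << comm_rank << ":" << stage << std::endl;
}
```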
9 changes: 5 additions & 4 deletions cpp/src/prims/detail/per_v_transform_reduce_e.cuh
@@ -1348,7 +1348,7 @@ void per_v_transform_reduce_e_edge_partition(
}
}

-#define PER_V_PERFORMANCE_MEASUREMENT 0
+#define PER_V_PERFORMANCE_MEASUREMENT 1

template <bool incoming, // iterate over incoming edges (incoming == true) or outgoing edges
// (incoming == false)
@@ -1376,6 +1376,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
VertexValueOutputIterator vertex_value_output_first)
{
#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete
+auto const comm_rank = handle.get_comms().get_rank();
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
#endif
@@ -2634,7 +2635,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
std::chrono::duration<double> subdur14 = subtime15 - subtime14;
std::chrono::duration<double> subdur15 = subtime16 - subtime15;
std::chrono::duration<double> subdur16 = subtime17 - subtime16;
-std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << ","
+std::cout << comm_rank << ":sub took (" << subdur0.count() << "," << subdur1.count() << ","
<< subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ","
<< subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << ","
<< subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << ","
@@ -2760,8 +2761,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = time1 - time0;
std::chrono::duration<double> dur1 = time2 - time1;
std::chrono::duration<double> dur2 = time3 - time2;
-std::cout << "\t\tdetail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count()
-          << "," << dur2.count() << ")" << std::endl;
+std::cout << "\t\t" << comm_rank << ":detail::per_v (prep, ep, comm) took (" << dur0.count()
+          << "," << dur1.count() << "," << dur2.count() << ")" << std::endl;
#endif
}

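The PER_V_PERFORMANCE_MEASUREMENT guard flipped above is a compile-time switch around a synchronize/timestamp/report pattern. A standalone sketch of that pattern (the guard and stage names here are illustrative stand-ins, not the file's actual code):

```cpp
#include <chrono>
#include <iostream>

#include <cuda_runtime.h>

#define PERF_MEASUREMENT 1  // illustrative stand-in for PER_V_PERFORMANCE_MEASUREMENT

void timed_stage()
{
#if PERF_MEASUREMENT
  cudaDeviceSynchronize();  // don't charge earlier GPU work to this stage
  auto time0 = std::chrono::steady_clock::now();
#endif
  // ... the work being measured ...
#if PERF_MEASUREMENT
  cudaDeviceSynchronize();  // make sure the measured work has actually finished
  auto time1 = std::chrono::steady_clock::now();
  std::chrono::duration<double> dur0 = time1 - time0;
  std::cout << "stage took (" << dur0.count() << ")" << std::endl;
#endif
}
```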
1 change: 1 addition & 0 deletions cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -372,6 +372,7 @@ void fill_edge_minor_property(raft::handle_t const& handle,
num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size());
num_concurrent_bcasts =
std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast<size_t>(major_comm_size));
+std::cout << comm.get_rank() << ":" << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl;

std::optional<std::vector<size_t>> stream_pool_indices{std::nullopt};
if (num_concurrent_bcasts > 1) {
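The value printed by the new line comes straight from the clamping just above it. Restated as a standalone function (a sketch; the function name is not from the source):

```cpp
#include <algorithm>
#include <cstddef>

// How the logged num_concurrent_bcasts is bounded: no more broadcasts than
// streams in the pool, at least one, and at most one per major-comm rank.
std::size_t clamp_num_concurrent_bcasts(std::size_t desired,
                                        std::size_t stream_pool_size,
                                        int major_comm_size)
{
  auto n = std::min(desired, stream_pool_size);
  n = std::min(std::max(n, std::size_t{1}), static_cast<std::size_t>(major_comm_size));
  return n;
}
```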
45 changes: 44 additions & 1 deletion cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -335,8 +335,13 @@ create_graph_from_partitioned_edgelist(
auto const minor_comm_size = minor_comm.get_size();

// 1. renumber
+#if 1
+auto const comm_rank = handle.get_comms().get_rank();
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl;
+#endif

-std::vector<edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0});
+std::vector < edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0});
for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) {
edgelist_edge_counts[i] = static_cast<edge_t>(edge_partition_edgelist_srcs[i].size());
}
@@ -362,6 +367,10 @@ create_graph_from_partitioned_edgelist(
num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2);

// 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_partitioned 1" << std::endl;
+#endif

auto total_global_mem = handle.get_device_properties().totalGlobalMem;
size_t element_size = sizeof(vertex_t) * 2;
@@ -567,6 +576,10 @@ create_graph_from_partitioned_edgelist(
}

// 3. segmented sort neighbors
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_partitioned 3" << std::endl;
+#endif

for (size_t i = 0; i < edge_partition_offsets.size(); ++i) {
if (edge_partition_weights) {
@@ -653,6 +666,10 @@ create_graph_from_partitioned_edgelist(
}

// 4. create a graph and an edge_property_t object.
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_partitioned 4" << std::endl;
+#endif

std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, true>, weight_t>>
edge_weights{std::nullopt};
@@ -933,6 +950,11 @@ create_graph_from_edgelist_impl(
bool renumber,
bool do_expensive_check)
{
+#if 1
+auto const comm_rank = handle.get_comms().get_rank();
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_edgelist_impl 0" << std::endl;
+#endif
auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
auto const major_comm_size = major_comm.get_size();
auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
@@ -1024,6 +1046,10 @@ create_graph_from_edgelist_impl(
// 1. groupby each edge chunks to their target local adjacency matrix partition (and further
// groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex
// IDs).
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl;
+#endif

std::vector<std::vector<rmm::device_uvector<vertex_t>>> edgelist_partitioned_srcs(
edgelist_srcs.size());
@@ -1154,6 +1180,10 @@ create_graph_from_edgelist_impl(
if (edgelist_edge_types) { (*edgelist_edge_types).clear(); }

// 2. split the grouped edge chunks to local partitions
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl;
+#endif

auto edgelist_intra_partition_segment_offsets = std::vector<std::vector<edge_t>>(minor_comm_size);

@@ -1201,7 +1231,13 @@ create_graph_from_edgelist_impl(
intra_partition_segment_sizes.end(),
intra_partition_segment_offsets.begin() + 1);

+#if 1
+std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl;
+#endif
rmm::device_uvector<vertex_t> tmp_srcs(edge_count, handle.get_stream());
+#if 1
+std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl;
+#endif
for (int j = 0; j < major_comm_size; ++j) {
for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) {
auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j];
@@ -1218,6 +1254,9 @@ create_graph_from_edgelist_impl(
edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs));

rmm::device_uvector<vertex_t> tmp_dsts(edge_count, handle.get_stream());
+#if 1
+std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl;
+#endif
for (int j = 0; j < major_comm_size; ++j) {
for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) {
auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j];
@@ -1289,6 +1328,10 @@ create_graph_from_edgelist_impl(

edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets);
}
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl;
+#endif

return create_graph_from_partitioned_edgelist<vertex_t,
edge_t,
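The per-iteration prints around tmp_srcs and tmp_dsts bracket the two large device allocations, so a rank that stalls or runs out of memory is identified together with the partition index and requested size. A sketch of that bracketing (allocate_logged is a hypothetical helper; in the real code the stream comes from the RAFT handle):

```cpp
#include <cstddef>
#include <iostream>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Bracket a large device allocation with rank-tagged prints, mirroring the
// tmp_srcs/tmp_dsts probes above.
template <typename vertex_t>
rmm::device_uvector<vertex_t> allocate_logged(std::size_t edge_count,
                                              rmm::cuda_stream_view stream,
                                              int comm_rank,
                                              std::size_t i,
                                              char const* name)
{
  std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl;
  rmm::device_uvector<vertex_t> buf(edge_count, stream);
  std::cout << comm_rank << ": i=" << i << " " << name << " allocated" << std::endl;
  return buf;
}
```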
52 changes: 52 additions & 0 deletions cpp/src/structure/renumber_edgelist_impl.cuh
@@ -247,6 +247,11 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
// 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to
// construct local_vertices)

+#if 1
+auto comm_rank = multi_gpu ? handle.get_comms().get_rank() : int{0};
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":compute_renumber_map 0" << std::endl;
+#endif
rmm::device_uvector<vertex_t> sorted_local_vertices(0, handle.get_stream());
if (!local_vertices) {
constexpr size_t num_bins{
@@ -521,6 +526,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
thrust::sort(
handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end());
}
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":compute_renumber_map 1" << std::endl;
+#endif

// 2. find an unused vertex ID

@@ -531,6 +540,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
CUGRAPH_EXPECTS(locally_unused_vertex_id.has_value(),
"Invalid input arguments: there is no unused value in the entire range of "
"vertex_t, increase vertex_t to 64 bit.");
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":compute_renumber_map 2" << std::endl;
+#endif

// 3. compute global degrees for the sorted local vertices

@@ -706,6 +719,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
offset += this_chunk_size;
}
}
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":compute_renumber_map 3" << std::endl;
+#endif

// 5. sort local vertices by degree (descending)

@@ -773,6 +790,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
d_segment_offsets.size(),
handle.get_stream());
handle.sync_stream();
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":compute_renumber_map 4" << std::endl;
+#endif

return std::make_tuple(
std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id);
@@ -1065,12 +1086,20 @@ renumber_edgelist(

// 1. compute renumber map

+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":renumber_edgelist 0" << std::endl;
+#endif
auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] =
detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(handle,
std::move(local_vertices),
edgelist_const_majors,
edgelist_const_minors,
edgelist_edge_counts);
+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":renumber_edgelist 1" << std::endl;
+#endif

// 2. initialize partition_t object, number_of_vertices, and number_of_edges

@@ -1106,6 +1135,10 @@ renumber_edgelist(

// 3. renumber edges

+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":renumber_edgelist 2" << std::endl;
+#endif
{
vertex_t max_edge_partition_major_range_size{0};
for (size_t i = 0; i < edgelist_majors.size(); ++i) {
@@ -1138,11 +1171,23 @@ renumber_edgelist(
}
}

+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank
+          << ":renumber_edgelist 3 partition.local_edge_partition_minor_range_size()="
+          << partition.local_edge_partition_minor_range_size()
+          << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size
+          << " edgelist_intra_partition_segment_offsets.has_value()="
+          << edgelist_intra_partition_segment_offsets.has_value() << std::endl;
+#endif
if ((static_cast<double>(partition.local_edge_partition_minor_range_size() *
2.5 /* tuning parameter */) >=
static_cast<double>(number_of_edges / comm_size)) &&
edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P))
// part than the O(E/P) part
+#if 1
+std::cout << comm_rank << ":path A" << std::endl;
+#endif
vertex_t max_segment_size{0};
for (int i = 0; i < major_comm_size; ++i) {
auto minor_range_vertex_partition_id =
@@ -1182,6 +1227,9 @@ renumber_edgelist(
}
}
} else {
+#if 1
+std::cout << comm_rank << ":path B" << std::endl;
+#endif
rmm::device_uvector<vertex_t> renumber_map_minor_labels(
partition.local_edge_partition_minor_range_size(), handle.get_stream());
std::vector<size_t> recvcounts(major_comm_size);
@@ -1216,6 +1264,10 @@ renumber_edgelist(
}
}

+#if 1
+RAFT_CUDA_TRY(cudaDeviceSynchronize());
+std::cout << comm_rank << ":renumber_edgelist 4" << std::endl;
+#endif
auto edge_partition_segment_offsets =
detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets);

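The new "path A"/"path B" prints, together with the sizes dumped at checkpoint 3, expose which side of the minor-renumbering heuristic each rank takes. The condition restated as a predicate (a sketch; the function name is illustrative):

```cpp
// The branch the "path A"/"path B" prints distinguish: take the segmented
// path (A) when the O(V/sqrt(P)) minor-range footprint dominates the O(E/P)
// edge footprint; 2.5 is the tuning parameter from the code above.
bool takes_path_a(double minor_range_size,
                  double number_of_edges,
                  int comm_size,
                  bool has_intra_partition_segment_offsets)
{
  return (minor_range_size * 2.5 >= number_of_edges / comm_size) &&
         has_intra_partition_segment_offsets;
}
```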
37 changes: 25 additions & 12 deletions cpp/src/traversal/bfs_impl.cuh
@@ -120,6 +120,7 @@ void bfs(raft::handle_t const& handle,
"GraphViewType should support the push model.");

#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
+auto const comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0};
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto prep0 = std::chrono::steady_clock::now();
#endif
@@ -275,13 +276,15 @@ void bfs(raft::handle_t const& handle,
true);
#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
RAFT_CUDA_TRY(cudaDeviceSynchronize());
-auto prep4 = std::chrono::steady_clock::now();
+  auto prep4 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = prep1 - prep0;
std::chrono::duration<double> dur1 = prep2 - prep1;
std::chrono::duration<double> dur2 = prep3 - prep2;
std::chrono::duration<double> dur3 = prep4 - prep3;
-std::chrono::duration<double> dur = prep4 - prep0;
-std::cout << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl;
+  std::chrono::duration<double> dur = prep4 - prep0;
+  std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count()
+            << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s."
+            << std::endl;
#endif

// 4. BFS iteration
Expand Down Expand Up @@ -350,8 +353,9 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur1 = topdown2 - topdown1;
std::chrono::duration<double> dur2 = topdown3 - topdown2;
std::chrono::duration<double> dur = topdown3 - topdown0;
-std::cout << "topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-          << "," << dur2.count() << ") s." << std::endl;
+std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took "
+          << dur.count() << " (" << dur0.count() << "," << dur1.count() << ","
+          << dur2.count() << ") s." << std::endl;
#endif
break;
}
@@ -368,7 +372,8 @@ void bfs(raft::handle_t const& handle,
#endif

if (direction_optimizing) {
-// FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed concurrently.
+// FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed
+// concurrently.
// FIXME: also the above fill_edge_dst_property can be executed concurrently.
auto m_f = thrust::transform_reduce(
handle.get_thrust_policy(),
@@ -455,9 +460,11 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur4 = topdown5 - topdown4;
std::chrono::duration<double> dur5 = topdown6 - topdown5;
std::chrono::duration<double> dur = topdown6 - topdown0;
-std::cout << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-          << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ","
-          << dur5.count() << ") s." << std::endl;
+std::cout << comm_rank << ":depth=" << depth
+          << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size
+          << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
+          << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
+          << "," << dur5.count() << ") s." << std::endl;
#endif
} else { // bottom up
#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
@@ -549,8 +556,9 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = bottomup1 - bottomup0;
std::chrono::duration<double> dur1 = bottomup2 - bottomup1;
std::chrono::duration<double> dur = bottomup2 - bottomup0;
-std::cout << "bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-          << ") s." << std::endl;
+std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took "
+          << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s."
+          << std::endl;
#endif
break;
}
@@ -605,7 +613,12 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur3 = bottomup4 - bottomup3;
std::chrono::duration<double> dur4 = bottomup5 - bottomup4;
std::chrono::duration<double> dur = bottomup5 - bottomup0;
-std::cout << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." << std::endl;
+std::cout << comm_rank << ":depth=" << depth
+          << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size
+          << " aggregate_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices
+          << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
+          << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
+          << ") s." << std::endl;
#endif
}
cur_aggregate_frontier_size = next_aggregate_frontier_size;
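With comm_rank and depth prefixed to every BFS timing line, output interleaved from multiple GPUs can be grepped per rank and aligned per iteration. A sketch of the resulting line format (report_depth and its phase labels are hypothetical, not part of the commit):

```cpp
#include <chrono>
#include <iostream>

// Emit one rank- and depth-tagged timing line in the format the BFS loop
// now produces.
void report_depth(int comm_rank,
                  int depth,
                  std::chrono::steady_clock::time_point t0,
                  std::chrono::steady_clock::time_point t1,
                  std::chrono::steady_clock::time_point t2)
{
  std::chrono::duration<double> dur0 = t1 - t0;  // primitive call
  std::chrono::duration<double> dur1 = t2 - t1;  // host-side bookkeeping
  std::chrono::duration<double> dur  = t2 - t0;
  std::cout << comm_rank << ":depth=" << depth << " took " << dur.count() << " ("
            << dur0.count() << "," << dur1.count() << ") s." << std::endl;
}
```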
