Commit

improve logging
seunghwak committed Sep 25, 2024
1 parent 07749f4 commit 4ddd0a1
Showing 5 changed files with 37 additions and 13 deletions.
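Most of the changes below redirect existing timing output from std::cout to std::cerr (unit-buffered and separate from regular program output) and add a few synchronize-then-log progress markers. A minimal, self-contained sketch of the instrumentation pattern these diffs touch; the phase names and sleep calls are placeholders, not cuGraph code:

#include <chrono>
#include <iostream>
#include <thread>

int main()
{
  auto time0 = std::chrono::steady_clock::now();
  std::this_thread::sleep_for(std::chrono::milliseconds(5));   // stand-in for phase 0 work
  auto time1 = std::chrono::steady_clock::now();
  std::this_thread::sleep_for(std::chrono::milliseconds(10));  // stand-in for phase 1 work
  auto time2 = std::chrono::steady_clock::now();

  std::chrono::duration<double> dur0 = time1 - time0;
  std::chrono::duration<double> dur1 = time2 - time1;
  // std::cerr rather than std::cout, matching the change made in this commit.
  std::cerr << "example (phase0,phase1) took (" << dur0.count() << "," << dur1.count() << ") s."
            << std::endl;
  return 0;
}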
4 changes: 2 additions & 2 deletions cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -1185,7 +1185,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
std::chrono::duration<double> subdur7 = subtime8 - subtime7;
std::chrono::duration<double> subdur8 = subtime9 - subtime8;
std::chrono::duration<double> subdur9 = subtime10 - subtime9;
std::cout << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count()
std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count()
<< "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << ","
<< subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << ","
<< subdur9.count() << ")" << std::endl;
@@ -1248,7 +1248,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = time1 - time0;
std::chrono::duration<double> dur1 = time2 - time1;
std::chrono::duration<double> dur2 = time3 - time2;
std::cout << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count()
std::cerr << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count()
<< "," << dur2.count() << ")" << std::endl;
#endif

4 changes: 2 additions & 2 deletions cpp/src/prims/detail/per_v_transform_reduce_e.cuh
@@ -2639,7 +2639,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
std::chrono::duration<double> subdur14 = subtime15 - subtime14;
std::chrono::duration<double> subdur15 = subtime16 - subtime15;
std::chrono::duration<double> subdur16 = subtime17 - subtime16;
std::cout << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << ","
std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << ","
<< subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ","
<< subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << ","
<< subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << ","
@@ -2765,7 +2765,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = time1 - time0;
std::chrono::duration<double> dur1 = time2 - time1;
std::chrono::duration<double> dur2 = time3 - time2;
std::cout << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count()
std::cerr << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count()
<< "," << dur1.count() << "," << dur2.count() << ")" << std::endl;
#endif
}
19 changes: 18 additions & 1 deletion cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
@@ -293,19 +293,36 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
d_tx_buffer_last_boundaries.size(),
handle.get_stream());
handle.sync_stream();
+#if 0
+std::vector<size_t> tx_counts(comm.get_size(), 0);
+for (int i = 0; i < major_comm_size; ++i) {
+auto r = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(major_comm_size, minor_comm_size, i, minor_comm_rank);
+tx_counts[r] = (i == 0) ? h_tx_buffer_last_boundaries[0] : (h_tx_buffer_last_boundaries[i] - h_tx_buffer_last_boundaries[i - 1]);
+}
+
+auto rx_key_buffer = allocate_dataframe_buffer<key_t>(size_t{0}, handle.get_stream());
+std::tie(rx_key_buffer, std::ignore) = shuffle_values(
+comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); // use comm instead of major_comm to save P2P buffer allocation
+#else
std::vector<size_t> tx_counts(h_tx_buffer_last_boundaries.size());
std::adjacent_difference(
h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin());

auto rx_key_buffer = allocate_dataframe_buffer<key_t>(size_t{0}, handle.get_stream());
std::tie(rx_key_buffer, std::ignore) = shuffle_values(
major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream());
+#endif
key_buffer = std::move(rx_key_buffer);

if constexpr (!std::is_same_v<payload_t, void>) {
auto rx_payload_buffer = allocate_dataframe_buffer<payload_t>(size_t{0}, handle.get_stream());
+#if 0
+std::tie(rx_payload_buffer, std::ignore) = shuffle_values(
+comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream());
+#else
std::tie(rx_payload_buffer, std::ignore) = shuffle_values(
major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream());
+#endif
payload_buffer = std::move(rx_payload_buffer);
}
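For context, the retained path above converts the cumulative boundaries in h_tx_buffer_last_boundaries into per-rank send counts with std::adjacent_difference. A small standalone sketch with made-up numbers (only the vector names are taken from the code above):

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  // Cumulative end offsets of the elements destined for each rank (made-up values).
  std::vector<std::size_t> h_tx_buffer_last_boundaries{3, 7, 12};
  std::vector<std::size_t> tx_counts(h_tx_buffer_last_boundaries.size());
  std::adjacent_difference(
    h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin());
  // tx_counts is now {3, 4, 5}; element i is the number of elements to send to rank i.
  for (auto c : tx_counts) { std::cout << c << " "; }
  std::cout << std::endl;
  return 0;
}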

@@ -319,7 +336,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = time1 - time0;
std::chrono::duration<double> dur1 = time2 - time1;
std::chrono::duration<double> dur2 = time3 - time2;
std::cout << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << ","
std::cerr << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << ","
<< dur2.count() << ")" << std::endl;
#endif

10 changes: 9 additions & 1 deletion cpp/src/structure/graph_impl.cuh
@@ -165,6 +165,7 @@ update_local_sorted_unique_edge_majors_minors(
// majors/minors to support storing edge major/minor properties in (key, value) pairs.

// 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets
+RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl;

{
auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range();
@@ -191,8 +192,10 @@
raft::comms::op_t::MAX,
handle.get_stream());

std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl;
if (max_minor_properties_fill_ratio <
detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) {
std::cerr << "K,V pairs" << std::endl;
auto const chunk_size =
static_cast<size_t>(std::min(1.0 / max_minor_properties_fill_ratio, 1024.0));

@@ -280,10 +283,11 @@ update_local_sorted_unique_edge_majors_minors(
}

// 2. Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets
+RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl;

std::vector<vertex_t> num_local_unique_edge_major_counts(edge_partition_offsets.size());
for (size_t i = 0; i < edge_partition_offsets.size(); ++i) {
-num_local_unique_edge_major_counts[i] += thrust::count_if(
+num_local_unique_edge_major_counts[i] = thrust::count_if(
handle.get_thrust_policy(),
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(static_cast<vertex_t>(edge_partition_offsets[i].size() - 1)),
@@ -304,6 +308,7 @@
raft::comms::op_t::MAX,
handle.get_stream());

std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl;
if (max_major_properties_fill_ratio <
detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) {
auto const chunk_size =
@@ -368,6 +373,7 @@ update_local_sorted_unique_edge_majors_minors(
}
local_sorted_unique_edge_major_chunk_size = chunk_size;
}
+RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl;

return std::make_tuple(std::move(local_sorted_unique_edge_majors),
std::move(local_sorted_unique_edge_major_chunk_start_offsets),
@@ -425,6 +431,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
edge_partition_indices_,
edge_partition_dcs_nzd_vertices_);
} else {
+RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "graph_t 0" << std::endl;
std::tie(local_sorted_unique_edge_srcs_,
local_sorted_unique_edge_src_chunk_start_offsets_,
local_sorted_unique_edge_src_chunk_size_,
@@ -438,6 +445,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
edge_partition_offsets_,
edge_partition_indices_,
edge_partition_dcs_nzd_vertices_);
+RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "graph_t 1" << std::endl;
}
}

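The cout/cerr lines added to graph_impl.cuh expose the fill ratio that decides whether edge major/minor properties are stored as (key, value) pairs. A rough sketch of that decision follows; the threshold value is invented and stands in for detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold, and the else-branch message is illustrative since that path is not shown in this diff.

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
  // Invented stand-in for the real threshold constant.
  constexpr double kv_pair_fill_ratio_threshold = 0.25;
  double max_minor_properties_fill_ratio        = 0.01;  // made-up measured ratio

  if (max_minor_properties_fill_ratio < kv_pair_fill_ratio_threshold) {
    // Sparse case: use (key, value) pairs, with a chunk size derived from the fill ratio and
    // capped at 1024, mirroring the expression visible in the hunk above.
    auto const chunk_size =
      static_cast<std::size_t>(std::min(1.0 / max_minor_properties_fill_ratio, 1024.0));
    std::cerr << "K,V pairs, chunk_size=" << chunk_size << std::endl;
  } else {
    std::cerr << "dense storage (illustrative message)" << std::endl;
  }
  return 0;
}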
13 changes: 6 additions & 7 deletions cpp/src/traversal/bfs_impl.cuh
@@ -120,7 +120,6 @@ void bfs(raft::handle_t const& handle,
"GraphViewType should support the push model.");

#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
-auto const comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0};
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto prep0 = std::chrono::steady_clock::now();
#endif
@@ -366,7 +365,7 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur2 = prep3 - prep2;
std::chrono::duration<double> dur3 = prep4 - prep3;
std::chrono::duration<double> dur = prep4 - prep0;
std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count()
std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count()
<< "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s."
<< std::endl;
#endif
@@ -437,7 +436,7 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur1 = topdown2 - topdown1;
std::chrono::duration<double> dur2 = topdown3 - topdown2;
std::chrono::duration<double> dur = topdown3 - topdown0;
std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took "
std::cerr << "depth=" << depth << " topdown (prim,vf,host) took "
<< dur.count() << " (" << dur0.count() << "," << dur1.count() << ","
<< dur2.count() << ") s." << std::endl;
#endif
@@ -567,7 +566,7 @@ void bfs(raft::handle_t const& handle,
? host_scalar_allreduce(
handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream())
: m_u;
std::cout << comm_rank << ":m_f=" << m_f << " m_u=" << m_u
std::cerr << "m_f=" << m_f << " m_u=" << m_u
<< " aggregate_m_f * direction_optimzing_alpha="
<< aggregate_m_f * direction_optimizing_alpha
<< " aggregate_m_u=" << aggregate_m_u
@@ -606,7 +605,7 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur4 = topdown5 - topdown4;
std::chrono::duration<double> dur5 = topdown6 - topdown5;
std::chrono::duration<double> dur = topdown6 - topdown0;
std::cout << comm_rank << ":depth=" << depth
std::cerr << "depth=" << depth
<< " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size
<< " next topdown=" << topdown
<< " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
@@ -703,7 +702,7 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur0 = bottomup1 - bottomup0;
std::chrono::duration<double> dur1 = bottomup2 - bottomup1;
std::chrono::duration<double> dur = bottomup2 - bottomup0;
std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took "
std::cerr << "depth=" << depth << " bottomup (prim+,host) took "
<< dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s."
<< std::endl;
#endif
@@ -760,7 +759,7 @@ void bfs(raft::handle_t const& handle,
std::chrono::duration<double> dur3 = bottomup4 - bottomup3;
std::chrono::duration<double> dur4 = bottomup5 - bottomup4;
std::chrono::duration<double> dur = bottomup5 - bottomup0;
std::cout << comm_rank << ":depth=" << depth
std::cerr << "depth=" << depth
<< " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size
<< " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices
<< " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
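The m_f/m_u log line above prints the quantities behind BFS direction-optimizing switching. The sketch below illustrates the Beamer-style comparison those names suggest; it is an illustration only, not the exact cuGraph condition, and the alpha value is invented.

#include <cstdint>
#include <iostream>

// Switch from top-down to bottom-up when the frontier-incident edge count (m_f), scaled by
// alpha, exceeds the edge count incident to still-unvisited vertices (m_u).
bool prefer_bottom_up(std::uint64_t m_f, std::uint64_t m_u, double alpha)
{
  return static_cast<double>(m_f) * alpha > static_cast<double>(m_u);
}

int main()
{
  constexpr double direction_optimizing_alpha = 12.0;     // invented value
  std::uint64_t const m_f                     = 1000000;  // made-up frontier edge count
  std::uint64_t const m_u                     = 5000000;  // made-up unvisited edge count
  std::cerr << "m_f=" << m_f << " m_u=" << m_u << " switch to bottom-up? "
            << (prefer_bottom_up(m_f, m_u, direction_optimizing_alpha) ? "yes" : "no")
            << std::endl;
  return 0;
}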
