From 4ddd0a1e67c666099789bce8c1d2d4613bca7b91 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 14:28:39 -0700 Subject: [PATCH] improve logging --- .../detail/extract_transform_v_frontier_e.cuh | 4 ++-- .../prims/detail/per_v_transform_reduce_e.cuh | 4 ++-- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 19 ++++++++++++++++++- cpp/src/structure/graph_impl.cuh | 10 +++++++++- cpp/src/traversal/bfs_impl.cuh | 13 ++++++------- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 7adca3e226c..21dda2b0c92 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1185,7 +1185,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur7 = subtime8 - subtime7; std::chrono::duration subdur8 = subtime9 - subtime8; std::chrono::duration subdur9 = subtime10 - subtime9; - std::cout << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() + std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << ")" << std::endl; @@ -1248,7 +1248,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + std::cerr << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index ed2a3fa35f4..54941f6b816 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -2639,7 +2639,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration subdur14 = subtime15 - subtime14; std::chrono::duration subdur15 = subtime16 - subtime15; std::chrono::duration subdur16 = subtime17 - subtime16; - std::cout << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," + std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," @@ -2765,7 +2765,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() + std::cerr << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 6efa9c6313f..1f080f7b103 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -293,6 +293,17 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, d_tx_buffer_last_boundaries.size(), handle.get_stream()); handle.sync_stream(); +#if 0 + std::vector tx_counts(comm.get_size(), 0); + for (int i = 0; i < major_comm_size; ++i) { + auto r = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(major_comm_size, minor_comm_size, i, minor_comm_rank); + tx_counts[r] = (i == 0) ? h_tx_buffer_last_boundaries[0] : (h_tx_buffer_last_boundaries[i] - h_tx_buffer_last_boundaries[i - 1]); + } + + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); // use comm insteads of major_comm to save P2P buffer allocation +#else std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); @@ -300,12 +311,18 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(rx_key_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); +#endif key_buffer = std::move(rx_key_buffer); if constexpr (!std::is_same_v) { auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); +#if 0 + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); +#else std::tie(rx_payload_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); +#endif payload_buffer = std::move(rx_payload_buffer); } @@ -319,7 +336,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << "," + std::cerr << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef43b7b13ec..4d5585304f6 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -165,6 +165,7 @@ update_local_sorted_unique_edge_majors_minors( // majors/minors to support storing edge major/minor properties in (key, value) pairs. // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl; { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); @@ -191,8 +192,10 @@ update_local_sorted_unique_edge_majors_minors( raft::comms::op_t::MAX, handle.get_stream()); +std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; if (max_minor_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + std::cerr << "K,V pairs" << std::endl; auto const chunk_size = static_cast(std::min(1.0 / max_minor_properties_fill_ratio, 1024.0)); @@ -280,10 +283,11 @@ update_local_sorted_unique_edge_majors_minors( } // 2. Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl; std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( + num_local_unique_edge_major_counts[i] = thrust::count_if( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), @@ -304,6 +308,7 @@ update_local_sorted_unique_edge_majors_minors( raft::comms::op_t::MAX, handle.get_stream()); +std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; if (max_major_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { auto const chunk_size = @@ -368,6 +373,7 @@ update_local_sorted_unique_edge_majors_minors( } local_sorted_unique_edge_major_chunk_size = chunk_size; } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl; return std::make_tuple(std::move(local_sorted_unique_edge_majors), std::move(local_sorted_unique_edge_major_chunk_start_offsets), @@ -425,6 +431,7 @@ graph_t dur2 = prep3 - prep2; std::chrono::duration dur3 = prep4 - prep3; std::chrono::duration dur = prep4 - prep0; - std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() + std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; #endif @@ -437,7 +436,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took " + std::cerr << "depth=" << depth << " topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ") s." << std::endl; #endif @@ -567,7 +566,7 @@ void bfs(raft::handle_t const& handle, ? host_scalar_allreduce( handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) : m_u; - std::cout << comm_rank << ":m_f=" << m_f << " m_u=" << m_u + std::cerr << "m_f=" << m_f << " m_u=" << m_u << " aggregate_m_f * direction_optimzing_alpha=" << aggregate_m_f * direction_optimizing_alpha << " aggregate_m_u=" << aggregate_m_u @@ -606,7 +605,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << comm_rank << ":depth=" << depth + std::cerr << "depth=" << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " next topdown=" << topdown << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," @@ -703,7 +702,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took " + std::cerr << "depth=" << depth << " bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." << std::endl; #endif @@ -760,7 +759,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << comm_rank << ":depth=" << depth + std::cerr << "depth=" << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","