diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index b31496bc960..eea8b3ccdec 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1348,7 +1348,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 0 +#define PER_V_PERFORMANCE_MEASUREMENT 1 template subdur14 = subtime15 - subtime14; std::chrono::duration subdur15 = subtime16 - subtime15; std::chrono::duration subdur16 = subtime17 - subtime16; - std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," + std::cout << comm_rank << ":sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," @@ -2760,8 +2761,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\tdetail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << ")" << std::endl; + std::cout << "\t\t" << comm_rank << ":detail::per_v (prep, ep, comm) took (" << dur0.count() + << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 3197ac8e963..b30397f5d8c 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -372,6 +372,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); + std::cout << comm.get_rank() << ":" << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; std::optional> stream_pool_indices{std::nullopt}; if (num_concurrent_bcasts > 1) { diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a1..b2ac05b3fa0 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -335,8 +335,13 @@ create_graph_from_partitioned_edgelist( auto const minor_comm_size = minor_comm.get_size(); // 1. renumber +#if 1 + auto const comm_rank = handle.get_comms().get_rank(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl; +#endif - std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); + std::vector < edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0}); for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) { edgelist_edge_counts[i] = static_cast(edge_partition_edgelist_srcs[i].size()); } @@ -362,6 +367,10 @@ create_graph_from_partitioned_edgelist( num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); // 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 1" << std::endl; +#endif auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; @@ -567,6 +576,10 @@ create_graph_from_partitioned_edgelist( } // 3. segmented sort neighbors +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 3" << std::endl; +#endif for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { if (edge_partition_weights) { @@ -653,6 +666,10 @@ create_graph_from_partitioned_edgelist( } // 4. create a graph and an edge_property_t object. +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 4" << std::endl; +#endif std::optional, weight_t>> edge_weights{std::nullopt}; @@ -933,6 +950,11 @@ create_graph_from_edgelist_impl( bool renumber, bool do_expensive_check) { +#if 1 + auto const comm_rank = handle.get_comms().get_rank(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 0" << std::endl; +#endif auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -1024,6 +1046,10 @@ create_graph_from_edgelist_impl( // 1. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl; +#endif std::vector>> edgelist_partitioned_srcs( edgelist_srcs.size()); @@ -1154,6 +1180,10 @@ create_graph_from_edgelist_impl( if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } // 2. split the grouped edge chunks to local partitions +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; +#endif auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); @@ -1201,7 +1231,13 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); +#if 1 + std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl; +#endif rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); +#if 1 + std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl; +#endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; @@ -1218,6 +1254,9 @@ create_graph_from_edgelist_impl( edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); +#if 1 + std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl; +#endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; @@ -1289,6 +1328,10 @@ create_graph_from_edgelist_impl( edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl; +#endif return create_graph_from_partitioned_edgelist, std::vector, vertex_t> compu // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to // construct local_vertices) +#if 1 + auto comm_rank = multi_gpu ? handle.get_comms().get_rank() : int{0}; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0" << std::endl; +#endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { constexpr size_t num_bins{ @@ -521,6 +526,10 @@ std::tuple, std::vector, vertex_t> compu thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 1" << std::endl; +#endif // 2. find an unused vertex ID @@ -531,6 +540,10 @@ std::tuple, std::vector, vertex_t> compu CUGRAPH_EXPECTS(locally_unused_vertex_id.has_value(), "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 2" << std::endl; +#endif // 3. compute global degrees for the sorted local vertices @@ -706,6 +719,10 @@ std::tuple, std::vector, vertex_t> compu offset += this_chunk_size; } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 3" << std::endl; +#endif // 5. sort local vertices by degree (descending) @@ -773,6 +790,10 @@ std::tuple, std::vector, vertex_t> compu d_segment_offsets.size(), handle.get_stream()); handle.sync_stream(); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 4" << std::endl; +#endif return std::make_tuple( std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); @@ -1065,12 +1086,20 @@ renumber_edgelist( // 1. compute renumber map +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 0" << std::endl; +#endif auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, edgelist_const_minors, edgelist_edge_counts); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 1" << std::endl; +#endif // 2. initialize partition_t object, number_of_vertices, and number_of_edges @@ -1106,6 +1135,10 @@ renumber_edgelist( // 3. renumber edges +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 2" << std::endl; +#endif { vertex_t max_edge_partition_major_range_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -1138,11 +1171,23 @@ renumber_edgelist( } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank + << ":renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" + << partition.local_edge_partition_minor_range_size() + << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size + << " edgelist_intra_partition_segment_offsets.has_value()=" + << edgelist_intra_partition_segment_offsets.has_value() << std::endl; +#endif if ((static_cast(partition.local_edge_partition_minor_range_size() * 2.5 /* tuning parameter */) >= static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part +#if 1 + std::cout << comm_rank << "path A" << std::endl; +#endif vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1182,6 +1227,9 @@ renumber_edgelist( } } } else { +#if 1 + std::cout << comm_rank << ":path B" << std::endl; +#endif rmm::device_uvector renumber_map_minor_labels( partition.local_edge_partition_minor_range_size(), handle.get_stream()); std::vector recvcounts(major_comm_size); @@ -1216,6 +1264,10 @@ renumber_edgelist( } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 4" << std::endl; +#endif auto edge_partition_segment_offsets = detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 1edb5e296b4..6afa5505af5 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -120,6 +120,7 @@ void bfs(raft::handle_t const& handle, "GraphViewType should support the push model."); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + auto const comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0}; RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep0 = std::chrono::steady_clock::now(); #endif @@ -275,13 +276,15 @@ void bfs(raft::handle_t const& handle, true); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto prep4 = std::chrono::steady_clock::now(); + auto prep4 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = prep1 - prep0; std::chrono::duration dur1 = prep2 - prep1; std::chrono::duration dur2 = prep3 - prep2; std::chrono::duration dur3 = prep4 - prep3; - std::chrono::duration dur = prep4 - prep0; - std::cout << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; + std::chrono::duration dur = prep4 - prep0; + std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() + << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." + << std::endl; #endif // 4. BFS iteration @@ -350,8 +353,9 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cout << "topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ") s." << std::endl; #endif break; } @@ -368,7 +372,8 @@ void bfs(raft::handle_t const& handle, #endif if (direction_optimizing) { - // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed concurrently. + // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed + // concurrently. // FIXME: also the above fill_edge_dst_property can be executed concurrently. auto m_f = thrust::transform_reduce( handle.get_thrust_policy(), @@ -455,9 +460,11 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," - << dur5.count() << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth + << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << "," << dur5.count() << ") s." << std::endl; #endif } else { // bottom up #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -549,8 +556,9 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cout << "bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." + << std::endl; #endif break; } @@ -605,7 +613,12 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth + << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices + << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << ") s." << std::endl; #endif } cur_aggregate_frontier_size = next_aggregate_frontier_size;