diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh
index b31496bc960..eea8b3ccdec 100644
--- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh
+++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh
@@ -1348,7 +1348,7 @@ void per_v_transform_reduce_e_edge_partition(
   }
 }
 
-#define PER_V_PERFORMANCE_MEASUREMENT 0
+#define PER_V_PERFORMANCE_MEASUREMENT 1
 
 template <bool incoming,  // iterate over incoming edges (incoming == true) or outgoing edges
                           // (incoming == false)
@@ -1376,6 +1376,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                               VertexValueOutputIterator vertex_value_output_first)
 {
 #if PER_V_PERFORMANCE_MEASUREMENT  // FIXME: delete
+  auto const comm_rank = handle.get_comms().get_rank();
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
   auto time0 = std::chrono::steady_clock::now();
 #endif
@@ -2634,7 +2635,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
         std::chrono::duration<double> subdur14 = subtime15 - subtime14;
         std::chrono::duration<double> subdur15 = subtime16 - subtime15;
         std::chrono::duration<double> subdur16 = subtime17 - subtime16;
-        std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << ","
+        std::cout << comm_rank << ":sub took (" << subdur0.count() << "," << subdur1.count() << ","
                   << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ","
                   << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << ","
                   << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << ","
@@ -2760,8 +2761,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
   std::chrono::duration<double> dur0 = time1 - time0;
   std::chrono::duration<double> dur1 = time2 - time1;
   std::chrono::duration<double> dur2 = time3 - time2;
-  std::cout << "\t\tdetail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count()
-            << "," << dur2.count() << ")" << std::endl;
+  std::cout << "\t\t" << comm_rank << ":detail::per_v (prep, ep, comm) took (" << dur0.count()
+            << "," << dur1.count() << "," << dur2.count() << ")" << std::endl;
 #endif
 }
 
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index 3197ac8e963..b30397f5d8c 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -372,6 +372,7 @@ void fill_edge_minor_property(raft::handle_t const& handle,
     num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size());
     num_concurrent_bcasts =
       std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast<size_t>(major_comm_size));
+    std::cout << comm.get_rank() << ":" << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl;
 
     std::optional<std::vector<size_t>> stream_pool_indices{std::nullopt};
     if (num_concurrent_bcasts > 1) {
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index 9796ddd60a1..b2ac05b3fa0 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -335,8 +335,13 @@ create_graph_from_partitioned_edgelist(
   auto const minor_comm_size = minor_comm.get_size();
 
   // 1. renumber
+#if 1
+  auto const comm_rank = handle.get_comms().get_rank();
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl;
+#endif
 
-  std::vector<edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0});
+    std::vector < edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0});
   for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) {
     edgelist_edge_counts[i] = static_cast<edge_t>(edge_partition_edgelist_srcs[i].size());
   }
@@ -362,6 +367,10 @@ create_graph_from_partitioned_edgelist(
     num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2);
 
   // 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_partitioned 1" << std::endl;
+#endif
 
   auto total_global_mem = handle.get_device_properties().totalGlobalMem;
   size_t element_size   = sizeof(vertex_t) * 2;
@@ -567,6 +576,10 @@ create_graph_from_partitioned_edgelist(
   }
 
   // 3. segmented sort neighbors
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_partitioned 3" << std::endl;
+#endif
 
   for (size_t i = 0; i < edge_partition_offsets.size(); ++i) {
     if (edge_partition_weights) {
@@ -653,6 +666,10 @@ create_graph_from_partitioned_edgelist(
   }
 
   // 4. create a graph and an edge_property_t object.
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_partitioned 4" << std::endl;
+#endif
 
   std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, true>, weight_t>>
     edge_weights{std::nullopt};
@@ -933,6 +950,11 @@ create_graph_from_edgelist_impl(
   bool renumber,
   bool do_expensive_check)
 {
+#if 1
+  auto const comm_rank = handle.get_comms().get_rank();
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_edgelist_impl 0" << std::endl;
+#endif
   auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
   auto const major_comm_size = major_comm.get_size();
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
@@ -1024,6 +1046,10 @@ create_graph_from_edgelist_impl(
   // 1. groupby each edge chunks to their target local adjacency matrix partition (and further
   // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex
   // IDs).
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl;
+#endif
 
   std::vector<std::vector<rmm::device_uvector<vertex_t>>> edgelist_partitioned_srcs(
     edgelist_srcs.size());
@@ -1154,6 +1180,10 @@ create_graph_from_edgelist_impl(
   if (edgelist_edge_types) { (*edgelist_edge_types).clear(); }
 
   // 2. split the grouped edge chunks to local partitions
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl;
+#endif
 
   auto edgelist_intra_partition_segment_offsets = std::vector<std::vector<edge_t>>(minor_comm_size);
 
@@ -1201,7 +1231,13 @@ create_graph_from_edgelist_impl(
                         intra_partition_segment_sizes.end(),
                         intra_partition_segment_offsets.begin() + 1);
 
+#if 1
+    std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl;
+#endif
     rmm::device_uvector<vertex_t> tmp_srcs(edge_count, handle.get_stream());
+#if 1
+    std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl;
+#endif
     for (int j = 0; j < major_comm_size; ++j) {
       for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) {
         auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j];
@@ -1218,6 +1254,9 @@ create_graph_from_edgelist_impl(
     edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs));
 
     rmm::device_uvector<vertex_t> tmp_dsts(edge_count, handle.get_stream());
+#if 1
+    std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl;
+#endif
     for (int j = 0; j < major_comm_size; ++j) {
       for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) {
         auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j];
@@ -1289,6 +1328,10 @@ create_graph_from_edgelist_impl(
 
     edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets);
   }
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl;
+#endif
 
   return create_graph_from_partitioned_edgelist<vertex_t,
                                                 edge_t,
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index abfa515df05..27bf9c4fcdf 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -247,6 +247,11 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
   // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to
   // construct local_vertices)
 
+#if 1
+  auto comm_rank = multi_gpu ? handle.get_comms().get_rank() : int{0};
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":compute_renumber_map 0" << std::endl;
+#endif
   rmm::device_uvector<vertex_t> sorted_local_vertices(0, handle.get_stream());
   if (!local_vertices) {
     constexpr size_t num_bins{
@@ -521,6 +526,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
     thrust::sort(
       handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end());
   }
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":compute_renumber_map 1" << std::endl;
+#endif
 
   // 2. find an unused vertex ID
 
@@ -531,6 +540,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
   CUGRAPH_EXPECTS(locally_unused_vertex_id.has_value(),
                   "Invalid input arguments: there is no unused value in the entire range of "
                   "vertex_t, increase vertex_t to 64 bit.");
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":compute_renumber_map 2" << std::endl;
+#endif
 
   // 3. compute global degrees for the sorted local vertices
 
@@ -706,6 +719,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
       offset += this_chunk_size;
     }
   }
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":compute_renumber_map 3" << std::endl;
+#endif
 
   // 5. sort local vertices by degree (descending)
 
@@ -773,6 +790,10 @@ std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t> compu
                     d_segment_offsets.size(),
                     handle.get_stream());
   handle.sync_stream();
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":compute_renumber_map 4" << std::endl;
+#endif
 
   return std::make_tuple(
     std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id);
@@ -1065,12 +1086,20 @@ renumber_edgelist(
 
   // 1. compute renumber map
 
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":renumber_edgelist 0" << std::endl;
+#endif
   auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] =
     detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(handle,
                                                               std::move(local_vertices),
                                                               edgelist_const_majors,
                                                               edgelist_const_minors,
                                                               edgelist_edge_counts);
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":renumber_edgelist 1" << std::endl;
+#endif
 
   // 2. initialize partition_t object, number_of_vertices, and number_of_edges
 
@@ -1106,6 +1135,10 @@ renumber_edgelist(
 
   // 3. renumber edges
 
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":renumber_edgelist 2" << std::endl;
+#endif
   {
     vertex_t max_edge_partition_major_range_size{0};
     for (size_t i = 0; i < edgelist_majors.size(); ++i) {
@@ -1138,11 +1171,23 @@ renumber_edgelist(
     }
   }
 
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank
+            << ":renumber_edgelist 3 partition.local_edge_partition_minor_range_size()="
+            << partition.local_edge_partition_minor_range_size()
+            << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size
+            << " edgelist_intra_partition_segment_offsets.has_value()="
+            << edgelist_intra_partition_segment_offsets.has_value() << std::endl;
+#endif
   if ((static_cast<double>(partition.local_edge_partition_minor_range_size() *
                            2.5 /* tuning parameter */) >=
        static_cast<double>(number_of_edges / comm_size)) &&
       edgelist_intra_partition_segment_offsets) {  // memory footprint dominated by the O(V/sqrt(P))
                                                    // part than the O(E/P) part
+#if 1
+    std::cout << comm_rank << "path A" << std::endl;
+#endif
     vertex_t max_segment_size{0};
     for (int i = 0; i < major_comm_size; ++i) {
       auto minor_range_vertex_partition_id =
@@ -1182,6 +1227,9 @@ renumber_edgelist(
       }
     }
   } else {
+#if 1
+    std::cout << comm_rank << ":path B" << std::endl;
+#endif
     rmm::device_uvector<vertex_t> renumber_map_minor_labels(
       partition.local_edge_partition_minor_range_size(), handle.get_stream());
     std::vector<size_t> recvcounts(major_comm_size);
@@ -1216,6 +1264,10 @@ renumber_edgelist(
     }
   }
 
+#if 1
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  std::cout << comm_rank << ":renumber_edgelist 4" << std::endl;
+#endif
   auto edge_partition_segment_offsets =
     detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets);
 
diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh
index 1edb5e296b4..6afa5505af5 100644
--- a/cpp/src/traversal/bfs_impl.cuh
+++ b/cpp/src/traversal/bfs_impl.cuh
@@ -120,6 +120,7 @@ void bfs(raft::handle_t const& handle,
                 "GraphViewType should support the push model.");
 
 #if BFS_PERFORMANCE_MEASUREMENT  // FIXME: delete
+  auto const comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0};
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
   auto prep0 = std::chrono::steady_clock::now();
 #endif
@@ -275,13 +276,15 @@ void bfs(raft::handle_t const& handle,
                          true);
 #if BFS_PERFORMANCE_MEASUREMENT  // FIXME: delete
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
-  auto prep4                        = std::chrono::steady_clock::now();
+  auto prep4                         = std::chrono::steady_clock::now();
   std::chrono::duration<double> dur0 = prep1 - prep0;
   std::chrono::duration<double> dur1 = prep2 - prep1;
   std::chrono::duration<double> dur2 = prep3 - prep2;
   std::chrono::duration<double> dur3 = prep4 - prep3;
-  std::chrono::duration<double> dur = prep4 - prep0;
-  std::cout << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl;
+  std::chrono::duration<double> dur  = prep4 - prep0;
+  std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count()
+            << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s."
+            << std::endl;
 #endif
 
   // 4. BFS iteration
@@ -350,8 +353,9 @@ void bfs(raft::handle_t const& handle,
         std::chrono::duration<double> dur1 = topdown2 - topdown1;
         std::chrono::duration<double> dur2 = topdown3 - topdown2;
         std::chrono::duration<double> dur  = topdown3 - topdown0;
-        std::cout << "topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-                  << "," << dur2.count() << ") s." << std::endl;
+        std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took "
+                  << dur.count() << " (" << dur0.count() << "," << dur1.count() << ","
+                  << dur2.count() << ") s." << std::endl;
 #endif
         break;
       }
@@ -368,7 +372,8 @@ void bfs(raft::handle_t const& handle,
 #endif
 
       if (direction_optimizing) {
-        // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed concurrently.
+        // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed
+        // concurrently.
         // FIXME: also the above fill_edge_dst_property can be executed concurrently.
         auto m_f = thrust::transform_reduce(
           handle.get_thrust_policy(),
@@ -455,9 +460,11 @@ void bfs(raft::handle_t const& handle,
       std::chrono::duration<double> dur4 = topdown5 - topdown4;
       std::chrono::duration<double> dur5 = topdown6 - topdown5;
       std::chrono::duration<double> dur  = topdown6 - topdown0;
-      std::cout << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-                << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ","
-                << dur5.count() << ") s." << std::endl;
+      std::cout << comm_rank << ":depth=" << depth
+                << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size
+                << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
+                << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
+                << "," << dur5.count() << ") s." << std::endl;
 #endif
     } else {                     // bottom up
 #if BFS_PERFORMANCE_MEASUREMENT  // FIXME: delete
@@ -549,8 +556,9 @@ void bfs(raft::handle_t const& handle,
         std::chrono::duration<double> dur0 = bottomup1 - bottomup0;
         std::chrono::duration<double> dur1 = bottomup2 - bottomup1;
         std::chrono::duration<double> dur  = bottomup2 - bottomup0;
-        std::cout << "bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count()
-                  << ") s." << std::endl;
+        std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took "
+                  << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s."
+                  << std::endl;
 #endif
         break;
       }
@@ -605,7 +613,12 @@ void bfs(raft::handle_t const& handle,
       std::chrono::duration<double> dur3 = bottomup4 - bottomup3;
       std::chrono::duration<double> dur4 = bottomup5 - bottomup4;
       std::chrono::duration<double> dur  = bottomup5 - bottomup0;
-      std::cout << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." << std::endl;
+      std::cout << comm_rank << ":depth=" << depth
+                << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size
+                << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices
+                << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
+                << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
+                << ") s." << std::endl;
 #endif
     }
     cur_aggregate_frontier_size = next_aggregate_frontier_size;