diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b8eaba9d575..62c7abcb149 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -168,379 +168,374 @@ endif() # which should give us a better parallel schedule. set(CUGRAPH_SOURCES - src/utilities/shuffle_vertices_mg_v32_fp.cu - src/utilities/shuffle_vertices_mg_v32_integral.cu + #src/utilities/shuffle_vertices_mg_v32_fp.cu + #src/utilities/shuffle_vertices_mg_v32_integral.cu src/utilities/shuffle_vertices_mg_v64_fp.cu src/utilities/shuffle_vertices_mg_v64_integral.cu - src/detail/permute_range_v32.cu + #src/detail/permute_range_v32.cu src/detail/permute_range_v64.cu - src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu - src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu + #src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu + #src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu - src/detail/collect_local_vertex_values_sg_v32_e32.cu - src/detail/collect_local_vertex_values_sg_v32_e64.cu - src/detail/collect_local_vertex_values_sg_v64_e64.cu - src/detail/collect_local_vertex_values_mg_v32_e32.cu - src/detail/collect_local_vertex_values_mg_v32_e64.cu + #src/detail/collect_local_vertex_values_sg_v32_e32.cu + #src/detail/collect_local_vertex_values_sg_v32_e64.cu + #src/detail/collect_local_vertex_values_sg_v64_e64.cu + #src/detail/collect_local_vertex_values_mg_v32_e32.cu + #src/detail/collect_local_vertex_values_mg_v32_e64.cu src/detail/collect_local_vertex_values_mg_v64_e64.cu - src/detail/groupby_and_count_mg_v32_e32.cu - src/detail/groupby_and_count_mg_v32_e64.cu + #src/detail/groupby_and_count_mg_v32_e32.cu + #src/detail/groupby_and_count_mg_v32_e64.cu src/detail/groupby_and_count_mg_v64_e64.cu - src/detail/collect_comm_wrapper_mg_v32_e32.cu + #src/detail/collect_comm_wrapper_mg_v32_e32.cu src/detail/collect_comm_wrapper_mg_v64_e64.cu - src/sampling/random_walks_mg_v64_e64.cu - src/sampling/random_walks_mg_v32_e32.cu - src/sampling/random_walks_mg_v32_e64.cu - 
src/community/detail/common_methods_mg_v64_e64.cu - src/community/detail/common_methods_mg_v32_e32.cu - src/community/detail/common_methods_mg_v32_e64.cu - src/community/detail/common_methods_sg_v64_e64.cu - src/community/detail/common_methods_sg_v32_e32.cu - src/community/detail/common_methods_sg_v32_e64.cu - src/community/detail/refine_sg_v64_e64.cu - src/community/detail/refine_sg_v32_e32.cu - src/community/detail/refine_sg_v32_e64.cu - src/community/detail/refine_mg_v64_e64.cu - src/community/detail/refine_mg_v32_e32.cu - src/community/detail/refine_mg_v32_e64.cu - src/community/edge_triangle_count_sg_v64_e64.cu - src/community/edge_triangle_count_sg_v32_e32.cu - src/community/edge_triangle_count_sg_v32_e64.cu - src/community/edge_triangle_count_mg_v64_e64.cu - src/community/edge_triangle_count_mg_v32_e32.cu - src/community/edge_triangle_count_mg_v32_e64.cu - src/community/detail/maximal_independent_moves_sg_v64_e64.cu - src/community/detail/maximal_independent_moves_sg_v32_e32.cu - src/community/detail/maximal_independent_moves_sg_v32_e64.cu - src/community/detail/maximal_independent_moves_mg_v64_e64.cu - src/community/detail/maximal_independent_moves_mg_v32_e32.cu - src/community/detail/maximal_independent_moves_mg_v32_e64.cu + #src/sampling/random_walks_mg_v64_e64.cu + #src/sampling/random_walks_mg_v32_e32.cu + #src/sampling/random_walks_mg_v32_e64.cu + #src/community/detail/common_methods_mg_v64_e64.cu + #src/community/detail/common_methods_mg_v32_e32.cu + #src/community/detail/common_methods_mg_v32_e64.cu + #src/community/detail/common_methods_sg_v64_e64.cu + #src/community/detail/common_methods_sg_v32_e32.cu + #src/community/detail/common_methods_sg_v32_e64.cu + #src/community/detail/refine_sg_v64_e64.cu + #src/community/detail/refine_sg_v32_e32.cu + #src/community/detail/refine_sg_v32_e64.cu + #src/community/detail/refine_mg_v64_e64.cu + #src/community/detail/refine_mg_v32_e32.cu + #src/community/detail/refine_mg_v32_e64.cu + 
#src/community/edge_triangle_count_sg_v64_e64.cu + #src/community/edge_triangle_count_sg_v32_e32.cu + ##src/community/edge_triangle_count_sg_v32_e64.cu + #src/community/edge_triangle_count_mg_v64_e64.cu + #src/community/edge_triangle_count_mg_v32_e32.cu + #src/community/edge_triangle_count_mg_v32_e64.cu + #src/community/detail/maximal_independent_moves_sg_v64_e64.cu + #src/community/detail/maximal_independent_moves_sg_v32_e32.cu + #src/community/detail/maximal_independent_moves_sg_v32_e64.cu + #src/community/detail/maximal_independent_moves_mg_v64_e64.cu + #src/community/detail/maximal_independent_moves_mg_v32_e32.cu + #src/community/detail/maximal_independent_moves_mg_v32_e64.cu src/detail/utility_wrappers_32.cu src/detail/utility_wrappers_64.cu src/structure/graph_view_mg_v64_e64.cu - src/structure/graph_view_mg_v32_e32.cu - src/structure/graph_view_mg_v32_e64.cu - src/structure/remove_self_loops_sg_v32_e32.cu - src/structure/remove_self_loops_sg_v32_e64.cu + #src/structure/graph_view_mg_v32_e32.cu + #src/structure/graph_view_mg_v32_e64.cu + #src/structure/remove_self_loops_sg_v32_e32.cu + #src/structure/remove_self_loops_sg_v32_e64.cu src/structure/remove_self_loops_sg_v64_e64.cu - src/structure/remove_multi_edges_sg_v32_e32.cu - src/structure/remove_multi_edges_sg_v32_e64.cu + #src/structure/remove_multi_edges_sg_v32_e32.cu + #src/structure/remove_multi_edges_sg_v32_e64.cu src/structure/remove_multi_edges_sg_v64_e64.cu - src/utilities/path_retrieval_sg_v32_e32.cu - src/utilities/path_retrieval_sg_v64_e64.cu - src/structure/legacy/graph.cu - src/linear_assignment/legacy/hungarian.cu - src/link_prediction/jaccard_sg_v64_e64.cu - src/link_prediction/jaccard_sg_v32_e32.cu - src/link_prediction/jaccard_sg_v32_e64.cu - src/link_prediction/sorensen_sg_v64_e64.cu - src/link_prediction/sorensen_sg_v32_e32.cu - src/link_prediction/sorensen_sg_v32_e64.cu - src/link_prediction/overlap_sg_v64_e64.cu - src/link_prediction/overlap_sg_v32_e32.cu - 
src/link_prediction/overlap_sg_v32_e64.cu - src/link_prediction/cosine_sg_v64_e64.cu - src/link_prediction/cosine_sg_v32_e32.cu - src/link_prediction/cosine_sg_v32_e64.cu - src/link_prediction/jaccard_mg_v64_e64.cu - src/link_prediction/jaccard_mg_v32_e32.cu - src/link_prediction/jaccard_mg_v32_e64.cu - src/link_prediction/sorensen_mg_v64_e64.cu - src/link_prediction/sorensen_mg_v32_e32.cu - src/link_prediction/sorensen_mg_v32_e64.cu - src/link_prediction/overlap_mg_v64_e64.cu - src/link_prediction/overlap_mg_v32_e32.cu - src/link_prediction/overlap_mg_v32_e64.cu - src/link_prediction/cosine_mg_v64_e64.cu - src/link_prediction/cosine_mg_v32_e32.cu - src/link_prediction/cosine_mg_v32_e64.cu - src/layout/legacy/force_atlas2.cu - src/converters/legacy/COOtoCSR.cu - src/community/legacy/spectral_clustering.cu - src/community/louvain_sg_v64_e64.cu - src/community/louvain_sg_v32_e32.cu - src/community/louvain_sg_v32_e64.cu - src/community/louvain_mg_v64_e64.cu - src/community/louvain_mg_v32_e32.cu - src/community/louvain_mg_v32_e64.cu - src/community/leiden_sg_v64_e64.cu - src/community/leiden_sg_v32_e32.cu - src/community/leiden_sg_v32_e64.cu - src/community/leiden_mg_v64_e64.cu - src/community/leiden_mg_v32_e32.cu - src/community/leiden_mg_v32_e64.cu - src/community/ecg_sg_v64_e64.cu - src/community/ecg_sg_v32_e32.cu - src/community/ecg_sg_v32_e64.cu - src/community/ecg_mg_v64_e64.cu - src/community/ecg_mg_v32_e32.cu - src/community/ecg_mg_v32_e64.cu - src/community/egonet_sg_v64_e64.cu - src/community/egonet_sg_v32_e32.cu - src/community/egonet_sg_v32_e64.cu - src/community/egonet_mg_v64_e64.cu - src/community/egonet_mg_v32_e32.cu - src/community/egonet_mg_v32_e64.cu - src/community/k_truss_sg_v64_e64.cu - src/community/k_truss_sg_v32_e32.cu - src/community/k_truss_sg_v32_e64.cu - src/community/k_truss_mg_v64_e64.cu - src/community/k_truss_mg_v32_e32.cu - src/community/k_truss_mg_v32_e64.cu - src/lookup/lookup_src_dst_mg_v32_e32.cu - 
src/lookup/lookup_src_dst_mg_v32_e64.cu - src/lookup/lookup_src_dst_mg_v64_e64.cu - src/lookup/lookup_src_dst_sg_v32_e32.cu - src/lookup/lookup_src_dst_sg_v32_e64.cu - src/lookup/lookup_src_dst_sg_v64_e64.cu - src/sampling/random_walks_old_sg_v32_e32.cu - src/sampling/random_walks_old_sg_v32_e64.cu - src/sampling/random_walks_old_sg_v64_e64.cu - src/sampling/random_walks_sg_v64_e64.cu - src/sampling/random_walks_sg_v32_e32.cu - src/sampling/random_walks_sg_v32_e64.cu - src/sampling/detail/prepare_next_frontier_sg_v64_e64.cu - src/sampling/detail/prepare_next_frontier_sg_v32_e32.cu - src/sampling/detail/prepare_next_frontier_mg_v64_e64.cu - src/sampling/detail/prepare_next_frontier_mg_v32_e32.cu - src/sampling/detail/gather_one_hop_edgelist_sg_v64_e64.cu - src/sampling/detail/gather_one_hop_edgelist_sg_v32_e32.cu - src/sampling/detail/gather_one_hop_edgelist_sg_v32_e64.cu - src/sampling/detail/gather_one_hop_edgelist_mg_v64_e64.cu - src/sampling/detail/gather_one_hop_edgelist_mg_v32_e32.cu - src/sampling/detail/gather_one_hop_edgelist_mg_v32_e64.cu - src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu - src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu - src/sampling/detail/check_edge_bias_values_sg_v64_e64.cu - src/sampling/detail/check_edge_bias_values_sg_v32_e32.cu - src/sampling/detail/check_edge_bias_values_sg_v32_e64.cu - src/sampling/detail/check_edge_bias_values_mg_v64_e64.cu - src/sampling/detail/check_edge_bias_values_mg_v32_e32.cu - src/sampling/detail/check_edge_bias_values_mg_v32_e64.cu - src/sampling/detail/sample_edges_sg_v64_e64.cu - src/sampling/detail/sample_edges_sg_v32_e32.cu - src/sampling/detail/sample_edges_sg_v32_e64.cu - src/sampling/detail/sample_edges_mg_v64_e64.cu - src/sampling/detail/sample_edges_mg_v32_e32.cu - src/sampling/detail/sample_edges_mg_v32_e64.cu + #src/utilities/path_retrieval_sg_v32_e32.cu + #src/utilities/path_retrieval_sg_v64_e64.cu + #src/structure/legacy/graph.cu + 
#src/linear_assignment/legacy/hungarian.cu + #src/link_prediction/jaccard_sg_v64_e64.cu + #src/link_prediction/jaccard_sg_v32_e32.cu + #src/link_prediction/jaccard_sg_v32_e64.cu + #src/link_prediction/sorensen_sg_v64_e64.cu + #src/link_prediction/sorensen_sg_v32_e32.cu + #src/link_prediction/sorensen_sg_v32_e64.cu + #src/link_prediction/overlap_sg_v64_e64.cu + #src/link_prediction/overlap_sg_v32_e32.cu + #src/link_prediction/overlap_sg_v32_e64.cu + #src/link_prediction/cosine_sg_v64_e64.cu + #src/link_prediction/cosine_sg_v32_e32.cu + #src/link_prediction/cosine_sg_v32_e64.cu + #src/link_prediction/jaccard_mg_v64_e64.cu + #src/link_prediction/jaccard_mg_v32_e32.cu + #src/link_prediction/jaccard_mg_v32_e64.cu + #src/link_prediction/sorensen_mg_v64_e64.cu + #src/link_prediction/sorensen_mg_v32_e32.cu + #src/link_prediction/sorensen_mg_v32_e64.cu + #src/link_prediction/overlap_mg_v64_e64.cu + #src/link_prediction/overlap_mg_v32_e32.cu + #src/link_prediction/overlap_mg_v32_e64.cu + #src/link_prediction/cosine_mg_v64_e64.cu + #src/link_prediction/cosine_mg_v32_e32.cu + #src/link_prediction/cosine_mg_v32_e64.cu + #src/layout/legacy/force_atlas2.cu + #src/converters/legacy/COOtoCSR.cu + #src/community/legacy/spectral_clustering.cu + #src/community/louvain_sg_v64_e64.cu + #src/community/louvain_sg_v32_e32.cu + #src/community/louvain_sg_v32_e64.cu + #src/community/louvain_mg_v64_e64.cu + #src/community/louvain_mg_v32_e32.cu + #src/community/louvain_mg_v32_e64.cu + #src/community/leiden_sg_v64_e64.cu + #src/community/leiden_sg_v32_e32.cu + #src/community/leiden_sg_v32_e64.cu + #src/community/leiden_mg_v64_e64.cu + #src/community/leiden_mg_v32_e32.cu + #src/community/leiden_mg_v32_e64.cu + #src/community/ecg_sg_v64_e64.cu + #src/community/ecg_sg_v32_e32.cu + #src/community/ecg_sg_v32_e64.cu + #src/community/ecg_mg_v64_e64.cu + #src/community/ecg_mg_v32_e32.cu + #src/community/ecg_mg_v32_e64.cu + #src/community/egonet_sg_v64_e64.cu + #src/community/egonet_sg_v32_e32.cu + 
#src/community/egonet_sg_v32_e64.cu + #src/community/egonet_mg_v64_e64.cu + #src/community/egonet_mg_v32_e32.cu + #src/community/egonet_mg_v32_e64.cu + #src/community/k_truss_sg_v64_e64.cu + #src/community/k_truss_sg_v32_e32.cu + #src/community/k_truss_sg_v32_e64.cu + #src/community/k_truss_mg_v64_e64.cu + #src/community/k_truss_mg_v32_e32.cu + #src/community/k_truss_mg_v32_e64.cu + #src/lookup/lookup_src_dst_mg_v32_e32.cu + #src/lookup/lookup_src_dst_mg_v32_e64.cu + #src/lookup/lookup_src_dst_mg_v64_e64.cu + #src/lookup/lookup_src_dst_sg_v32_e32.cu + #src/lookup/lookup_src_dst_sg_v32_e64.cu + #src/lookup/lookup_src_dst_sg_v64_e64.cu + #src/sampling/random_walks_old_sg_v32_e32.cu + #src/sampling/random_walks_old_sg_v32_e64.cu + #src/sampling/random_walks_old_sg_v64_e64.cu + #src/sampling/random_walks_sg_v64_e64.cu + #src/sampling/random_walks_sg_v32_e32.cu + #src/sampling/random_walks_sg_v32_e64.cu + #src/sampling/detail/prepare_next_frontier_sg_v64_e64.cu + #src/sampling/detail/prepare_next_frontier_sg_v32_e32.cu + #src/sampling/detail/prepare_next_frontier_mg_v64_e64.cu + #src/sampling/detail/prepare_next_frontier_mg_v32_e32.cu + #src/sampling/detail/gather_one_hop_edgelist_sg_v64_e64.cu + #src/sampling/detail/gather_one_hop_edgelist_sg_v32_e32.cu + #src/sampling/detail/gather_one_hop_edgelist_sg_v32_e64.cu + #src/sampling/detail/gather_one_hop_edgelist_mg_v64_e64.cu + #src/sampling/detail/gather_one_hop_edgelist_mg_v32_e32.cu + #src/sampling/detail/gather_one_hop_edgelist_mg_v32_e64.cu + #src/sampling/detail/remove_visited_vertices_from_frontier_sg_v32_e32.cu + #src/sampling/detail/remove_visited_vertices_from_frontier_sg_v64_e64.cu + #src/sampling/detail/check_edge_bias_values_sg_v64_e64.cu + #src/sampling/detail/check_edge_bias_values_sg_v32_e32.cu + #src/sampling/detail/check_edge_bias_values_sg_v32_e64.cu + #src/sampling/detail/check_edge_bias_values_mg_v64_e64.cu + #src/sampling/detail/check_edge_bias_values_mg_v32_e32.cu + 
#src/sampling/detail/check_edge_bias_values_mg_v32_e64.cu + #src/sampling/detail/sample_edges_sg_v64_e64.cu + #src/sampling/detail/sample_edges_sg_v32_e32.cu + #src/sampling/detail/sample_edges_sg_v32_e64.cu + #src/sampling/detail/sample_edges_mg_v64_e64.cu + #src/sampling/detail/sample_edges_mg_v32_e32.cu + #src/sampling/detail/sample_edges_mg_v32_e64.cu src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu - src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu - src/sampling/detail/shuffle_and_organize_output_mg_v32_e64.cu - src/sampling/neighbor_sampling_mg_v32_e64.cpp - src/sampling/neighbor_sampling_mg_v32_e32.cpp - src/sampling/neighbor_sampling_mg_v64_e64.cpp - src/sampling/neighbor_sampling_sg_v32_e64.cpp - src/sampling/neighbor_sampling_sg_v32_e32.cpp - src/sampling/neighbor_sampling_sg_v64_e64.cpp - src/sampling/negative_sampling_sg_v32_e64.cu - src/sampling/negative_sampling_sg_v32_e32.cu - src/sampling/negative_sampling_sg_v64_e64.cu - src/sampling/negative_sampling_mg_v32_e64.cu - src/sampling/negative_sampling_mg_v32_e32.cu - src/sampling/negative_sampling_mg_v64_e64.cu - src/sampling/sampling_post_processing_sg_v64_e64.cu - src/sampling/sampling_post_processing_sg_v32_e32.cu - src/sampling/sampling_post_processing_sg_v32_e64.cu - src/cores/core_number_sg_v64_e64.cu - src/cores/core_number_sg_v32_e32.cu - src/cores/core_number_sg_v32_e64.cu - src/cores/core_number_mg_v64_e64.cu - src/cores/core_number_mg_v32_e32.cu - src/cores/core_number_mg_v32_e64.cu - src/cores/k_core_sg_v64_e64.cu - src/cores/k_core_sg_v32_e32.cu - src/cores/k_core_sg_v32_e64.cu - src/cores/k_core_mg_v64_e64.cu - src/cores/k_core_mg_v32_e32.cu - src/cores/k_core_mg_v32_e64.cu - src/components/legacy/connectivity.cu - src/generators/generate_rmat_edgelist_sg_v32_e32.cu + #src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu + #src/sampling/detail/shuffle_and_organize_output_mg_v32_e64.cu + #src/sampling/neighbor_sampling_mg_v32_e64.cpp + 
#src/sampling/neighbor_sampling_mg_v32_e32.cpp + #src/sampling/neighbor_sampling_mg_v64_e64.cpp + #src/sampling/neighbor_sampling_sg_v32_e64.cpp + #src/sampling/neighbor_sampling_sg_v32_e32.cpp + #src/sampling/neighbor_sampling_sg_v64_e64.cpp + #src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu + #src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu + #src/sampling/sampling_post_processing_sg_v64_e64.cu + #src/sampling/sampling_post_processing_sg_v32_e32.cu + #src/sampling/sampling_post_processing_sg_v32_e64.cu + #src/cores/core_number_sg_v64_e64.cu + #src/cores/core_number_sg_v32_e32.cu + #src/cores/core_number_sg_v32_e64.cu + ##src/cores/core_number_mg_v64_e64.cu + #src/cores/core_number_mg_v32_e32.cu + #src/cores/core_number_mg_v32_e64.cu + ##src/cores/k_core_sg_v64_e64.cu + #src/cores/k_core_sg_v32_e32.cu + #src/cores/k_core_sg_v32_e64.cu + #src/cores/k_core_mg_v64_e64.cu + #src/cores/k_core_mg_v32_e32.cu + #src/cores/k_core_mg_v32_e64.cu + #src/components/legacy/connectivity.cu + #src/generators/generate_rmat_edgelist_sg_v32_e32.cu src/generators/generate_rmat_edgelist_sg_v64_e64.cu - src/generators/generate_bipartite_rmat_edgelist_sg_v32_e32.cu - src/generators/generate_bipartite_rmat_edgelist_sg_v64_e64.cu - src/generators/generator_tools_sg_v32_e32.cu + #src/generators/generate_bipartite_rmat_edgelist_sg_v32_e32.cu + #src/generators/generate_bipartite_rmat_edgelist_sg_v64_e64.cu + #src/generators/generator_tools_sg_v32_e32.cu src/generators/generator_tools_sg_v64_e64.cu - src/generators/simple_generators_sg_v32_e32.cu - src/generators/simple_generators_sg_v64_e64.cu - src/generators/erdos_renyi_generator_sg_v32_e32.cu - src/generators/erdos_renyi_generator_sg_v64_e64.cu - src/structure/graph_sg_v64_e64.cu - src/structure/graph_sg_v32_e32.cu - src/structure/graph_sg_v32_e64.cu + #src/generators/simple_generators_sg_v32_e32.cu + #src/generators/simple_generators_sg_v64_e64.cu + #src/generators/erdos_renyi_generator_sg_v32_e32.cu + 
#src/generators/erdos_renyi_generator_sg_v64_e64.cu + #src/structure/graph_sg_v64_e64.cu + #src/structure/graph_sg_v32_e32.cu + #src/structure/graph_sg_v32_e64.cu src/structure/graph_mg_v64_e64.cu - src/structure/graph_mg_v32_e32.cu - src/structure/graph_mg_v32_e64.cu - src/structure/graph_view_sg_v64_e64.cu - src/structure/graph_view_sg_v32_e32.cu - src/structure/graph_view_sg_v32_e64.cu - src/structure/decompress_to_edgelist_sg_v64_e64.cu - src/structure/decompress_to_edgelist_sg_v32_e32.cu - src/structure/decompress_to_edgelist_sg_v32_e64.cu + #src/structure/graph_mg_v32_e32.cu + #src/structure/graph_mg_v32_e64.cu + #src/structure/graph_view_sg_v64_e64.cu + #src/structure/graph_view_sg_v32_e32.cu + #src/structure/graph_view_sg_v32_e64.cu + #src/structure/decompress_to_edgelist_sg_v64_e64.cu + #src/structure/decompress_to_edgelist_sg_v32_e32.cu + #src/structure/decompress_to_edgelist_sg_v32_e64.cu src/structure/decompress_to_edgelist_mg_v64_e64.cu - src/structure/decompress_to_edgelist_mg_v32_e32.cu - src/structure/decompress_to_edgelist_mg_v32_e64.cu - src/structure/symmetrize_graph_sg_v64_e64.cu - src/structure/symmetrize_graph_sg_v32_e32.cu - src/structure/symmetrize_graph_sg_v32_e64.cu + #src/structure/decompress_to_edgelist_mg_v32_e32.cu + #src/structure/decompress_to_edgelist_mg_v32_e64.cu + #src/structure/symmetrize_graph_sg_v64_e64.cu + #src/structure/symmetrize_graph_sg_v32_e32.cu + #src/structure/symmetrize_graph_sg_v32_e64.cu src/structure/symmetrize_graph_mg_v64_e64.cu - src/structure/symmetrize_graph_mg_v32_e32.cu - src/structure/symmetrize_graph_mg_v32_e64.cu - src/structure/transpose_graph_sg_v64_e64.cu - src/structure/transpose_graph_sg_v32_e32.cu - src/structure/transpose_graph_sg_v32_e64.cu - src/structure/transpose_graph_mg_v64_e64.cu - src/structure/transpose_graph_mg_v32_e32.cu - src/structure/transpose_graph_mg_v32_e64.cu - src/structure/transpose_graph_storage_sg_v64_e64.cu - src/structure/transpose_graph_storage_sg_v32_e32.cu - 
src/structure/transpose_graph_storage_sg_v32_e64.cu - src/structure/transpose_graph_storage_mg_v64_e64.cu - src/structure/transpose_graph_storage_mg_v32_e32.cu - src/structure/transpose_graph_storage_mg_v32_e64.cu - src/structure/coarsen_graph_sg_v64_e64.cu - src/structure/coarsen_graph_sg_v32_e32.cu - src/structure/coarsen_graph_sg_v32_e64.cu - src/structure/coarsen_graph_mg_v64_e64.cu - src/structure/coarsen_graph_mg_v32_e32.cu - src/structure/coarsen_graph_mg_v32_e64.cu + #src/structure/symmetrize_graph_mg_v32_e32.cu + #src/structure/symmetrize_graph_mg_v32_e64.cu + #src/structure/transpose_graph_sg_v64_e64.cu + #src/structure/transpose_graph_sg_v32_e32.cu + #src/structure/transpose_graph_sg_v32_e64.cu + #src/structure/transpose_graph_mg_v64_e64.cu + #src/structure/transpose_graph_mg_v32_e32.cu + #src/structure/transpose_graph_mg_v32_e64.cu + #src/structure/transpose_graph_storage_sg_v64_e64.cu + #src/structure/transpose_graph_storage_sg_v32_e32.cu + #src/structure/transpose_graph_storage_sg_v32_e64.cu + #src/structure/transpose_graph_storage_mg_v64_e64.cu + #src/structure/transpose_graph_storage_mg_v32_e32.cu + #src/structure/transpose_graph_storage_mg_v32_e64.cu + #src/structure/coarsen_graph_sg_v64_e64.cu + #src/structure/coarsen_graph_sg_v32_e32.cu + #src/structure/coarsen_graph_sg_v32_e64.cu + #src/structure/coarsen_graph_mg_v64_e64.cu + #src/structure/coarsen_graph_mg_v32_e32.cu + #src/structure/coarsen_graph_mg_v32_e64.cu src/structure/graph_weight_utils_mg_v64_e64.cu - src/structure/graph_weight_utils_mg_v32_e32.cu - src/structure/graph_weight_utils_mg_v32_e64.cu - src/structure/graph_weight_utils_sg_v64_e64.cu - src/structure/graph_weight_utils_sg_v32_e32.cu - src/structure/graph_weight_utils_sg_v32_e64.cu - src/structure/renumber_edgelist_sg_v64_e64.cu - src/structure/renumber_edgelist_sg_v32_e32.cu - src/structure/renumber_edgelist_sg_v32_e64.cu + #src/structure/graph_weight_utils_mg_v32_e32.cu + #src/structure/graph_weight_utils_mg_v32_e64.cu + 
#src/structure/graph_weight_utils_sg_v64_e64.cu + #src/structure/graph_weight_utils_sg_v32_e32.cu + #src/structure/graph_weight_utils_sg_v32_e64.cu + #src/structure/renumber_edgelist_sg_v64_e64.cu + #src/structure/renumber_edgelist_sg_v32_e32.cu + #src/structure/renumber_edgelist_sg_v32_e64.cu src/structure/renumber_edgelist_mg_v64_e64.cu - src/structure/renumber_edgelist_mg_v32_e32.cu - src/structure/renumber_edgelist_mg_v32_e64.cu - src/structure/renumber_utils_sg_v64_e64.cu - src/structure/renumber_utils_sg_v32_e32.cu + #src/structure/renumber_edgelist_mg_v32_e32.cu + #src/structure/renumber_edgelist_mg_v32_e64.cu + #src/structure/renumber_utils_sg_v64_e64.cu + #src/structure/renumber_utils_sg_v32_e32.cu src/structure/renumber_utils_mg_v64_e64.cu - src/structure/renumber_utils_mg_v32_e32.cu - src/structure/relabel_sg_v64_e64.cu - src/structure/relabel_sg_v32_e32.cu + #src/structure/renumber_utils_mg_v32_e32.cu + #src/structure/relabel_sg_v64_e64.cu + #src/structure/relabel_sg_v32_e32.cu src/structure/relabel_mg_v64_e64.cu - src/structure/relabel_mg_v32_e32.cu - src/structure/induced_subgraph_sg_v64_e64.cu - src/structure/induced_subgraph_sg_v32_e32.cu - src/structure/induced_subgraph_sg_v32_e64.cu - src/structure/induced_subgraph_mg_v64_e64.cu - src/structure/induced_subgraph_mg_v32_e32.cu - src/structure/induced_subgraph_mg_v32_e64.cu - src/structure/select_random_vertices_sg_v64_e64.cu - src/structure/select_random_vertices_sg_v32_e32.cu - src/structure/select_random_vertices_sg_v32_e64.cu + #src/structure/relabel_mg_v32_e32.cu + #src/structure/induced_subgraph_sg_v64_e64.cu + #src/structure/induced_subgraph_sg_v32_e32.cu + #src/structure/induced_subgraph_sg_v32_e64.cu + #src/structure/induced_subgraph_mg_v64_e64.cu + #src/structure/induced_subgraph_mg_v32_e32.cu + #src/structure/induced_subgraph_mg_v32_e64.cu + #src/structure/select_random_vertices_sg_v64_e64.cu + #src/structure/select_random_vertices_sg_v32_e32.cu + 
#src/structure/select_random_vertices_sg_v32_e64.cu src/structure/select_random_vertices_mg_v64_e64.cu - src/structure/select_random_vertices_mg_v32_e32.cu - src/structure/select_random_vertices_mg_v32_e64.cu - src/traversal/extract_bfs_paths_sg_v64_e64.cu - src/traversal/extract_bfs_paths_sg_v32_e32.cu - src/traversal/extract_bfs_paths_sg_v32_e64.cu - src/traversal/extract_bfs_paths_mg_v64_e64.cu - src/traversal/extract_bfs_paths_mg_v32_e32.cu - src/traversal/extract_bfs_paths_mg_v32_e64.cu - src/traversal/bfs_sg_v64_e64.cu - src/traversal/bfs_sg_v32_e32.cu - src/traversal/bfs_sg_v32_e64.cu + #src/structure/select_random_vertices_mg_v32_e32.cu + #src/structure/select_random_vertices_mg_v32_e64.cu + #src/traversal/extract_bfs_paths_sg_v64_e64.cu + #src/traversal/extract_bfs_paths_sg_v32_e32.cu + #src/traversal/extract_bfs_paths_sg_v32_e64.cu + #src/traversal/extract_bfs_paths_mg_v64_e64.cu + #src/traversal/extract_bfs_paths_mg_v32_e32.cu + #src/traversal/extract_bfs_paths_mg_v32_e64.cu + #src/traversal/bfs_sg_v64_e64.cu + #src/traversal/bfs_sg_v32_e32.cu + #src/traversal/bfs_sg_v32_e64.cu src/traversal/bfs_mg_v64_e64.cu - src/traversal/bfs_mg_v32_e32.cu - src/traversal/bfs_mg_v32_e64.cu - src/traversal/sssp_sg_v64_e64.cu - src/traversal/sssp_sg_v32_e32.cu - src/traversal/sssp_sg_v32_e64.cu - src/traversal/od_shortest_distances_sg_v64_e64.cu - src/traversal/od_shortest_distances_sg_v32_e32.cu - src/traversal/od_shortest_distances_sg_v32_e64.cu - src/traversal/sssp_mg_v64_e64.cu - src/traversal/sssp_mg_v32_e32.cu - src/traversal/sssp_mg_v32_e64.cu - src/link_analysis/hits_sg_v64_e64.cu - src/link_analysis/hits_sg_v32_e32.cu - src/link_analysis/hits_sg_v32_e64.cu - src/link_analysis/hits_mg_v64_e64.cu - src/link_analysis/hits_mg_v32_e32.cu - src/link_analysis/hits_mg_v32_e64.cu - src/link_analysis/pagerank_sg_v64_e64.cu - src/link_analysis/pagerank_sg_v32_e32.cu - src/link_analysis/pagerank_sg_v32_e64.cu - src/link_analysis/pagerank_mg_v64_e64.cu - 
src/link_analysis/pagerank_mg_v32_e32.cu - src/link_analysis/pagerank_mg_v32_e64.cu - src/centrality/katz_centrality_sg_v64_e64.cu - src/centrality/katz_centrality_sg_v32_e32.cu - src/centrality/katz_centrality_sg_v32_e64.cu - src/centrality/katz_centrality_mg_v64_e64.cu - src/centrality/katz_centrality_mg_v32_e32.cu - src/centrality/katz_centrality_mg_v32_e64.cu - src/centrality/eigenvector_centrality_sg_v64_e64.cu - src/centrality/eigenvector_centrality_sg_v32_e32.cu - src/centrality/eigenvector_centrality_sg_v32_e64.cu - src/centrality/eigenvector_centrality_mg_v64_e64.cu - src/centrality/eigenvector_centrality_mg_v32_e32.cu - src/centrality/eigenvector_centrality_mg_v32_e64.cu - src/centrality/betweenness_centrality_sg_v64_e64.cu - src/centrality/betweenness_centrality_sg_v32_e32.cu - src/centrality/betweenness_centrality_sg_v32_e64.cu - src/centrality/betweenness_centrality_mg_v64_e64.cu - src/centrality/betweenness_centrality_mg_v32_e32.cu - src/centrality/betweenness_centrality_mg_v32_e64.cu - src/tree/legacy/mst.cu - src/from_cugraph_ops/sampling_index.cu - src/components/weakly_connected_components_sg_v64_e64.cu - src/components/weakly_connected_components_sg_v32_e32.cu - src/components/weakly_connected_components_sg_v32_e64.cu - src/components/weakly_connected_components_mg_v64_e64.cu - src/components/weakly_connected_components_mg_v32_e32.cu - src/components/weakly_connected_components_mg_v32_e64.cu - src/components/mis_sg_v64_e64.cu - src/components/mis_sg_v32_e32.cu - src/components/mis_sg_v32_e64.cu - src/components/mis_mg_v64_e64.cu - src/components/mis_mg_v32_e32.cu - src/components/mis_mg_v32_e64.cu - src/components/vertex_coloring_sg_v64_e64.cu - src/components/vertex_coloring_sg_v32_e32.cu - src/components/vertex_coloring_sg_v32_e64.cu - src/components/vertex_coloring_mg_v64_e64.cu - src/components/vertex_coloring_mg_v32_e32.cu - src/components/vertex_coloring_mg_v32_e64.cu - src/structure/create_graph_from_edgelist_sg_v64_e64.cu - 
src/structure/create_graph_from_edgelist_sg_v32_e32.cu - src/structure/create_graph_from_edgelist_sg_v32_e64.cu + #src/traversal/bfs_mg_v32_e32.cu + #src/traversal/bfs_mg_v32_e64.cu + #src/traversal/sssp_sg_v64_e64.cu + #src/traversal/sssp_sg_v32_e32.cu + #src/traversal/sssp_sg_v32_e64.cu + #src/traversal/od_shortest_distances_sg_v64_e64.cu + #src/traversal/od_shortest_distances_sg_v32_e32.cu + #src/traversal/od_shortest_distances_sg_v32_e64.cu + #src/traversal/sssp_mg_v64_e64.cu + #src/traversal/sssp_mg_v32_e32.cu + #src/traversal/sssp_mg_v32_e64.cu + #src/link_analysis/hits_sg_v64_e64.cu + #src/link_analysis/hits_sg_v32_e32.cu + #src/link_analysis/hits_sg_v32_e64.cu + #src/link_analysis/hits_mg_v64_e64.cu + #src/link_analysis/hits_mg_v32_e32.cu + #src/link_analysis/hits_mg_v32_e64.cu + #src/link_analysis/pagerank_sg_v64_e64.cu + #src/link_analysis/pagerank_sg_v32_e32.cu + #src/link_analysis/pagerank_sg_v32_e64.cu + #src/link_analysis/pagerank_mg_v64_e64.cu + #src/link_analysis/pagerank_mg_v32_e32.cu + #src/link_analysis/pagerank_mg_v32_e64.cu + #src/centrality/katz_centrality_sg_v64_e64.cu + ##src/centrality/katz_centrality_sg_v32_e32.cu + #src/centrality/katz_centrality_sg_v32_e64.cu + #src/centrality/katz_centrality_mg_v64_e64.cu + #src/centrality/katz_centrality_mg_v32_e32.cu + #src/centrality/katz_centrality_mg_v32_e64.cu + #src/centrality/eigenvector_centrality_sg_v64_e64.cu + #src/centrality/eigenvector_centrality_sg_v32_e32.cu + #src/centrality/eigenvector_centrality_sg_v32_e64.cu + #src/centrality/eigenvector_centrality_mg_v64_e64.cu + #src/centrality/eigenvector_centrality_mg_v32_e32.cu + #src/centrality/eigenvector_centrality_mg_v32_e64.cu + #src/centrality/betweenness_centrality_sg_v64_e64.cu + #src/centrality/betweenness_centrality_sg_v32_e32.cu + #src/centrality/betweenness_centrality_sg_v32_e64.cu + #src/centrality/betweenness_centrality_mg_v64_e64.cu + #src/centrality/betweenness_centrality_mg_v32_e32.cu + 
#src/centrality/betweenness_centrality_mg_v32_e64.cu + #src/tree/legacy/mst.cu + #src/components/weakly_connected_components_sg_v64_e64.cu + #src/components/weakly_connected_components_sg_v32_e32.cu + #src/components/weakly_connected_components_sg_v32_e64.cu + #src/components/weakly_connected_components_mg_v64_e64.cu + #src/components/weakly_connected_components_mg_v32_e32.cu + #src/components/weakly_connected_components_mg_v32_e64.cu + #src/components/mis_sg_v64_e64.cu + #src/components/mis_sg_v32_e32.cu + #src/components/mis_sg_v32_e64.cu + #src/components/mis_mg_v64_e64.cu + #src/components/mis_mg_v32_e32.cu + #src/components/mis_mg_v32_e64.cu + #src/components/vertex_coloring_sg_v64_e64.cu + #src/components/vertex_coloring_sg_v32_e32.cu + #src/components/vertex_coloring_sg_v32_e64.cu + #src/components/vertex_coloring_mg_v64_e64.cu + #src/components/vertex_coloring_mg_v32_e32.cu + #src/components/vertex_coloring_mg_v32_e64.cu + #src/structure/create_graph_from_edgelist_sg_v64_e64.cu + #src/structure/create_graph_from_edgelist_sg_v32_e32.cu + #src/structure/create_graph_from_edgelist_sg_v32_e64.cu src/structure/create_graph_from_edgelist_mg_v64_e64.cu - src/structure/create_graph_from_edgelist_mg_v32_e32.cu - src/structure/create_graph_from_edgelist_mg_v32_e64.cu - src/structure/symmetrize_edgelist_sg_v64_e64.cu - src/structure/symmetrize_edgelist_sg_v32_e32.cu + #src/structure/create_graph_from_edgelist_mg_v32_e32.cu + #src/structure/create_graph_from_edgelist_mg_v32_e64.cu + #src/structure/symmetrize_edgelist_sg_v64_e64.cu + #src/structure/symmetrize_edgelist_sg_v32_e32.cu src/structure/symmetrize_edgelist_mg_v64_e64.cu - src/structure/symmetrize_edgelist_mg_v32_e32.cu - src/community/triangle_count_sg_v64_e64.cu - src/community/triangle_count_sg_v32_e32.cu - src/community/triangle_count_sg_v32_e64.cu - src/community/triangle_count_mg_v64_e64.cu - src/community/triangle_count_mg_v32_e32.cu - src/community/triangle_count_mg_v32_e64.cu - 
src/community/approx_weighted_matching_sg_v64_e64.cu - src/community/approx_weighted_matching_sg_v32_e32.cu - src/community/approx_weighted_matching_sg_v32_e64.cu - src/community/approx_weighted_matching_mg_v64_e64.cu - src/community/approx_weighted_matching_mg_v32_e32.cu - src/community/approx_weighted_matching_mg_v32_e64.cu - src/traversal/k_hop_nbrs_sg_v64_e64.cu - src/traversal/k_hop_nbrs_sg_v32_e32.cu - src/traversal/k_hop_nbrs_sg_v32_e64.cu + #src/structure/symmetrize_edgelist_mg_v32_e32.cu + #src/community/triangle_count_sg_v64_e64.cu + #src/community/triangle_count_sg_v32_e32.cu + #src/community/triangle_count_sg_v32_e64.cu + #src/community/triangle_count_mg_v64_e64.cu + #src/community/triangle_count_mg_v32_e32.cu + #src/community/triangle_count_mg_v32_e64.cu + #src/community/approx_weighted_matching_sg_v64_e64.cu + #src/community/approx_weighted_matching_sg_v32_e32.cu + #src/community/approx_weighted_matching_sg_v32_e64.cu + #src/community/approx_weighted_matching_mg_v64_e64.cu + #src/community/approx_weighted_matching_mg_v32_e32.cu + #src/community/approx_weighted_matching_mg_v32_e64.cu + #src/traversal/k_hop_nbrs_sg_v64_e64.cu + #src/traversal/k_hop_nbrs_sg_v32_e32.cu + #src/traversal/k_hop_nbrs_sg_v32_e64.cu src/traversal/k_hop_nbrs_mg_v64_e64.cu - src/traversal/k_hop_nbrs_mg_v32_e32.cu - src/traversal/k_hop_nbrs_mg_v32_e64.cu - src/mtmg/vertex_result_sg_v32_e32.cu - src/mtmg/vertex_result_sg_v64_e64.cu - src/mtmg/vertex_result_mg_v32_e32.cu + #src/traversal/k_hop_nbrs_mg_v32_e32.cu + #src/traversal/k_hop_nbrs_mg_v32_e64.cu + #src/mtmg/vertex_result_sg_v32_e32.cu + #src/mtmg/vertex_result_sg_v64_e64.cu + #src/mtmg/vertex_result_mg_v32_e32.cu src/mtmg/vertex_result_mg_v64_e64.cu - src/mtmg/vertex_pairs_result_sg_v32_e32.cu - src/mtmg/vertex_pairs_result_sg_v64_e64.cu - src/mtmg/vertex_pairs_result_mg_v32_e32.cu + #src/mtmg/vertex_pairs_result_sg_v32_e32.cu + #src/mtmg/vertex_pairs_result_sg_v64_e64.cu + #src/mtmg/vertex_pairs_result_mg_v32_e32.cu 
src/mtmg/vertex_pairs_result_mg_v64_e64.cu ) @@ -661,7 +656,6 @@ add_library(cugraph_c src/c_api/louvain.cpp src/c_api/triangle_count.cpp src/c_api/neighbor_sampling.cpp - src/c_api/negative_sampling.cpp src/c_api/labeling_result.cpp src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 583b0a37214..628c3cc10cc 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -204,6 +204,7 @@ class edge_partition_device_view_t view) : detail::edge_partition_device_view_base_t(view.offsets(), view.indices()), dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())), + dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())), major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())), major_range_first_(view.major_range_first()), major_range_last_(view.major_range_last()), @@ -218,6 +219,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + 
cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -266,7 +328,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -284,7 +346,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -295,7 +357,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -368,7 +431,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), 
local_degrees.begin(), @@ -394,7 +457,7 @@ class edge_partition_device_view_t> for consistency (see + // dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} @@ -528,10 +593,20 @@ class edge_partition_device_view_t> dcs_nzd_range_bitmap() + const + { + return dcs_nzd_range_bitmap_ + ? thrust::make_optional>( + (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) + : thrust::nullopt; + } + private: // should be trivially copyable to device thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; + thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; thrust::optional major_hypersparse_first_{thrust::nullopt}; vertex_t major_range_first_{0}; @@ -558,6 +633,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); - 
thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -595,7 +709,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -613,6 +727,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -660,7 +775,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp index 42465273718..f0693f4b1a9 100644 --- a/cpp/include/cugraph/edge_partition_view.hpp +++ b/cpp/include/cugraph/edge_partition_view.hpp @@ -56,6 +56,7 @@ class edge_partition_view_t offsets, raft::device_span indices, std::optional> dcs_nzd_vertices, + std::optional> dcs_nzd_range_bitmap, std::optional major_hypersparse_first, vertex_t major_range_first, vertex_t major_range_last, @@ -64,6 +65,7 @@ class edge_partition_view_t(offsets, indices), dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap), major_hypersparse_first_(major_hypersparse_first), major_range_first_(major_range_first), major_range_last_(major_range_last), @@ -78,6 +80,11 @@ class edge_partition_view_t> dcs_nzd_range_bitmap() const + { + return dcs_nzd_range_bitmap_; + } + std::optional 
major_hypersparse_first() const { return major_hypersparse_first_; } vertex_t major_range_first() const { return major_range_first_; } @@ -90,6 +97,7 @@ class edge_partition_view_t> dcs_nzd_vertices_{std::nullopt}; + std::optional> dcs_nzd_range_bitmap_{std::nullopt}; std::optional major_hypersparse_first_{std::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 0607b39153d..290f4b3c4db 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -48,6 +48,7 @@ struct graph_meta_t> { partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; vertex_t num_local_unique_edge_srcs{}; vertex_t num_local_unique_edge_dsts{}; @@ -61,6 +62,7 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) @@ -101,6 +103,11 @@ class graph_t>>( (*edge_partition_dcs_nzd_vertices_).size()) : std::nullopt; + auto dcs_nzd_range_bitmaps = + edge_partition_dcs_nzd_range_bitmaps_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_range_bitmaps_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), edge_partition_offsets_[i].size()); @@ -111,6 +118,11 @@ class graph_t((*edge_partition_dcs_nzd_vertices_)[i].data(), (*edge_partition_dcs_nzd_vertices_)[i].size()); } + if (dcs_nzd_range_bitmaps) { + (*dcs_nzd_range_bitmaps)[i] = + raft::device_span((*edge_partition_dcs_nzd_range_bitmaps_)[i].data(), + (*edge_partition_dcs_nzd_range_bitmaps_)[i].size()); + } } std::conditional_t{ this->number_of_vertices(), this->number_of_edges(), this->properties_, partition_, edge_partition_segment_offsets_, + edge_partition_hypersparse_degree_offsets_, local_sorted_unique_edge_srcs, local_sorted_unique_edge_src_chunk_start_offsets, local_sorted_unique_edge_src_chunk_size_, @@ -224,10 +238,13 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; + std::optional>> edge_partition_dcs_nzd_range_bitmaps_{ + std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge // sources/destinations << V / major_comm_size|minor_comm_size). 
@@ -290,7 +307,11 @@ class graph_t(offsets_.data(), offsets_.size()), raft::device_span(indices_.data(), indices_.size()), graph_view_meta_t{ - this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); + this->number_of_vertices(), + this->number_of_edges(), + this->properties_, + segment_offsets_, + hypersparse_degree_offsets_}); } private: @@ -299,6 +320,7 @@ class graph_t> segment_offsets_{}; + std::optional> hypersparse_degree_offsets_{}; }; template diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 866ab16ee97..b7db152d476 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -41,11 +41,13 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; }; template struct renumber_meta_t> { std::vector segment_offsets{}; + std::optional> hypersparse_degree_offsets{}; }; /** @@ -244,7 +246,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -284,7 +286,7 @@ std::enable_if_t unrenumber_local_int_edges( * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. 
+ * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -308,6 +310,38 @@ std::enable_if_t unrenumber_local_int_edges(raft::handle_t con vertex_t num_vertices, bool do_expensive_check = false); +/** + * @brief Unrenumber local internal edge destinations to external vertices based on the provided @p + * renumber_map. + * + * Note cugraph::invalid_id::value remains unchanged. This function requires the input + * edge destination vertices to unrenumber to be sorted and unique. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if + * true) as major indices in storing edges using a 2D sparse matrix. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param sorted_unique_vertices Span object storing the pointer to the edge destination internal + * vertices to be unrenumbered and the size of the pointed array. The input edge destination + * internal vertices should be sorted and unique. The input edge destination internal vertices are + * renumbered to external vertices in-place. + * @param renumber_map Span object storing pointer to the external vertices corresponding to the + * internal vertices (assigned to this process in multi-GPU) and the size of the array. + * @param vertex_partition_range_lasts Last local internal vertices (exclusive, assigned to each + * process in multi-GPU). 
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_vertices /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check = false); + /** * @brief Renumber local external vertices to internal vertices based on the provided @p * renumber_map_labels. @@ -346,7 +380,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam edge_type_t Type of edge types. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -388,7 +422,7 @@ decompress_to_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -421,7 +455,7 @@ symmetrize_edgelist(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. 
Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -463,7 +497,7 @@ symmetrize_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -505,7 +539,7 @@ transpose_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -549,7 +583,7 @@ transpose_graph_storage( * @tparam edge_t Type of edge identifiers. 
Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -625,7 +659,7 @@ void relabel(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -678,7 +712,7 @@ extract_induced_subgraphs( * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -743,7 +777,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, * @tparam edge_type_t Type of edge type. 
Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -807,7 +841,7 @@ create_graph_from_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -829,7 +863,7 @@ std::tuple, rmm::device_uvector> get_two * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -856,7 +890,7 @@ rmm::device_uvector compute_in_weight_sums( * @tparam edge_t Type of edge identifiers. 
Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -883,7 +917,7 @@ rmm::device_uvector compute_out_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -910,7 +944,7 @@ weight_t compute_max_in_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and @@ -937,7 +971,7 @@ weight_t compute_max_out_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -963,7 +997,7 @@ weight_t compute_total_edge_weight( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1114,7 +1148,8 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, * @param edge_ids Optional list of edge ids * @param edge_types Optional list of edge types * @return Tuple of vectors storing edge sources, destinations, optional weights, - * optional edge ids, optional edge types mapped to this GPU. + * optional edge ids, optional edge types mapped to this GPU and a vector storing the + * number of edges received from each GPU. 
*/ template std::tuple, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a2ff3166fa4..6d3da3740bf 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -165,7 +165,12 @@ class partition_t { return vertex_partition_range_last(partition_idx) - vertex_partition_range_first(partition_idx); } - size_t number_of_local_edge_partitions() const { return minor_comm_size_; } + size_t number_of_local_edge_partitions() const { return static_cast(minor_comm_size_); } + size_t coinciding_local_edge_partition_idx() const + { + return static_cast(minor_comm_rank_); + } // the major range of coinciding_local_edge_partition_idx()'th local edge partition coincides + // with the local vertex partition range // major: source of the edge partition (if not transposed) or destination of the edge partition // (if transposed). @@ -243,15 +248,20 @@ namespace detail { // use (key, value) pairs to store source/destination properties if (unique edge // sources/destinations) over (V / major_comm_size|minor_comm_size) is smaller than the threshold // value -double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = 0.1; +double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = + 0.0; // FIXME: just for benchmarking // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than minor_comm_size * hypersparse_threshold_ratio, should be less than 1.0 double constexpr hypersparse_threshold_ratio = 0.5; -size_t constexpr low_degree_threshold{raft::warp_size()}; -size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_sparse_segments_per_vertex_partition{3}; +size_t constexpr low_degree_threshold{ + raft::warp_size()}; // belongs to the low degree segment if the global degree is smaller than + // this value. 
+size_t constexpr mid_degree_threshold{ + 1024}; // belongs to the medium degree segment if the global degree is smaller than this value, + // otherwise, belongs to the high degree segment. +size_t constexpr num_sparse_segments_per_vertex_partition{3}; // high, mid, low // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template @@ -313,6 +323,7 @@ struct graph_view_meta_t edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; std::conditional_t>, @@ -356,6 +367,7 @@ struct graph_view_meta_t> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) @@ -380,6 +392,8 @@ class graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -552,6 +566,12 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_segment_offsets(partition_idx); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx) const { @@ -563,6 +583,28 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_hypersparse_degree_offsets(partition_idx); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx) const + { + auto num_degrees_per_vertex_partition = + edge_partition_hypersparse_degree_offsets_ + ? ((*edge_partition_hypersparse_degree_offsets_).size() / edge_partition_offsets_.size()) + : size_t{0}; + return edge_partition_hypersparse_degree_offsets_ + ? 
std::make_optional>( + (*edge_partition_hypersparse_degree_offsets_).begin() + + partition_idx * num_degrees_per_vertex_partition, + (*edge_partition_hypersparse_degree_offsets_).begin() + + (partition_idx + 1) * num_degrees_per_vertex_partition) + : std::nullopt; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices(), @@ -605,6 +647,9 @@ class graph_view_t>> edge_partition_dcs_nzd_vertices_{}; + std::optional>> + edge_partition_dcs_nzd_range_bitmaps_{}; partition_t partition_{}; // segment offsets based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store source/destination property values in key/value pairs (this saves memory if # // unique edge sources/destinations << V / major_comm_size|minor_comm_size). @@ -903,6 +951,11 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + return local_edge_partition_segment_offsets(size_t{0}); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx = 0) const { @@ -910,6 +963,18 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + return local_edge_partition_hypersparse_degree_offsets(size_t{0}); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx = 0) const + { + assert(partition_idx == 0); + return hypersparse_degree_offsets_; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices()); @@ -1050,6 +1115,7 @@ class graph_view_t> segment_offsets_{std::nullopt}; + std::optional> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; }; diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e646..377fd0a4de9 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ 
b/cpp/include/cugraph/partition_manager.hpp @@ -22,6 +22,8 @@ #include #include +#include // FIXME: temporarily added for setenv + #include namespace cugraph { @@ -71,6 +73,30 @@ class partition_manager { : (major_comm_rank * minor_comm_size + minor_comm_rank); } +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_major_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank % major_comm_size + : comm_rank / minor_comm_size; + } + +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_minor_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank / major_comm_size + : comm_rank % minor_comm_size; + } + #ifdef __CUDACC__ __host__ __device__ #endif @@ -137,10 +163,35 @@ class partition_manager { int row_idx = rank / gpu_row_comm_size; int col_idx = rank % gpu_row_comm_size; +#if 1 // FIXME: a trick to use InfiniBand SHARP in a sub-communicator (currently, a GPU can + // participate in only one SHARP accelerated communicator) + comm.barrier(); // to enforce initialization in comm + std::cerr << "start intializing node_comm" << std::endl; + std::cerr << "start intializing major_comm" << std::endl; + handle.set_subcomm("gpu_row_comm", + std::make_shared(comm.comm_split(row_idx, col_idx))); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + major_comm.barrier(); /// to enforce initialization in major_comm + std::cerr << "major_comm initialized" << std::endl; +#if 1 // for EOS + auto ret = setenv("NCCL_COLLNET_ENABLE", "1", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_COLLNET_ENABLE\", \"1\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SHARP_DISABLE", "0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_DISABLE\", \"0\", 1) returned " << ret << std::endl; +#endif + 
handle.set_subcomm("gpu_col_comm", + std::make_shared(comm.comm_split(col_idx, row_idx))); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm.barrier(); /// to enforce initialization in minor_comm + std::cerr << "minor_comm initialized" << std::endl; +#else handle.set_subcomm("gpu_row_comm", std::make_shared(comm.comm_split(row_idx, col_idx))); handle.set_subcomm("gpu_col_comm", std::make_shared(comm.comm_split(col_idx, row_idx))); +#endif }; }; diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index a20613c65ef..6d47ec540da 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -82,6 +82,53 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_ std::make_index_sequence(), buffer_size, stream_view); } +template +struct dataframe_buffer_type { + using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; + +template +using dataframe_buffer_type_t = typename dataframe_buffer_type::type; + +template +std::optional> try_allocate_dataframe_buffer( + size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + try { + return allocate_dataframe_buffer(buffer_size, stream_view); + } catch (std::exception const& e) { + return std::nullopt; + } +} + +template +struct dataframe_buffer_iterator_type { + using type = typename rmm::device_uvector::iterator; +}; + +template +struct dataframe_buffer_iterator_type> { + using type = thrust::zip_iterator::iterator...>>; +}; + +template +using dataframe_buffer_iterator_type_t = typename dataframe_buffer_iterator_type::type; + +template +struct dataframe_buffer_const_iterator_type { + using type = typename rmm::device_uvector::const_iterator; +}; + +template +struct dataframe_buffer_const_iterator_type> { + using type = + thrust::zip_iterator::const_iterator...>>; +}; + +template +using 
dataframe_buffer_const_iterator_type_t = + typename dataframe_buffer_const_iterator_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, @@ -206,30 +253,4 @@ auto get_dataframe_buffer_cend(BufferType& buffer) std::make_index_sequence::value>(), buffer); } -template -struct dataframe_buffer_value_type { - using type = void; -}; - -template -struct dataframe_buffer_value_type> { - using type = T; -}; - -template -struct dataframe_buffer_value_type...>> { - using type = thrust::tuple; -}; - -template -using dataframe_buffer_value_type_t = typename dataframe_buffer_value_type::type; - -template -struct dataframe_buffer_type { - using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); -}; - -template -using dataframe_buffer_type_t = typename dataframe_buffer_type::type; - } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/device_comm.hpp b/cpp/include/cugraph/utilities/device_comm.hpp index ffb0f7d9e5b..07de2d06466 100644 --- a/cpp/include/cugraph/utilities/device_comm.hpp +++ b/cpp/include/cugraph/utilities/device_comm.hpp @@ -55,7 +55,7 @@ auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter } template -std::enable_if_t::value, void> +std::enable_if_t, void> device_isend_impl(raft::comms::comms_t const& comm, InputIterator input_first, size_t count, @@ -76,7 +76,7 @@ std::enable_if_t::value, void> device_isend_ raft::comms::request_t* request) { static_assert( - std::is_same::value_type, OutputValueType>::value); + std::is_same_v::value_type, OutputValueType>); comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); } @@ -136,7 +136,7 @@ device_irecv_impl(raft::comms::comms_t const& comm, { static_assert( - std::is_same::value_type>::value); + std::is_same_v::value_type>); comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); } @@ -200,7 +200,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename 
std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_sendrecv(iter_to_raw_ptr(input_first), tx_count, dst, @@ -286,7 +286,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), tx_counts, tx_offsets, @@ -379,8 +379,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.bcast( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } @@ -440,8 +440,8 @@ device_allreduce_impl(raft::comms::comms_t const& comm, raft::comms::op_t op, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allreduce( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } @@ -503,8 +503,8 @@ device_reduce_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, @@ -548,6 +548,62 @@ struct device_reduce_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator 
output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); + comm.allgather( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, stream_view.value()); +} + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + device_allgather_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + stream_view); + device_allgather_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, stream_view); + } +}; + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + } +}; + template std::enable_if_t::value, void> device_allgatherv_impl(raft::comms::comms_t const& comm, @@ -571,8 +627,8 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, std::vector const& displacements, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allgatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), recvcounts.data(), @@ -639,8 +695,8 @@ device_gatherv_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - 
static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.gatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, @@ -1000,6 +1056,44 @@ device_reduce(raft::comms::comms_t const& comm, .run(comm, input_first, output_first, count, op, root, stream_view); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + detail::device_allgather_impl(comm, input_first, output_first, sendcount, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allgather_tuple_iterator_element_impl() + .run(comm, input_first, output_first, sendcount, stream_view); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 633dabe5b40..91a349007da 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -81,7 +81,7 @@ std::tuple, std::vector> compute_offset_aligned_ return std::make_tuple(h_chunk_offsets, h_element_offsets); } else { - return std::make_tuple(std::vector{{0, offsets.size() - 1}}, + return std::make_tuple(std::vector{{0, static_cast(offsets.size() - 1)}}, std::vector{{0, num_elements}}); } } diff 
--git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3cbd35b4bc3..98fa2cb1706 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,8 @@ namespace cugraph { namespace detail { +constexpr size_t cache_line_size = 128; + template struct compute_group_id_count_pair_t { GroupIdIterator group_id_first{}; @@ -76,6 +79,7 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, rmm::device_uvector const& d_tx_value_counts, + bool drop_empty_ranks, rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -111,28 +115,30 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); - int num_tx_dst_ranks{0}; - int num_rx_src_ranks{0}; - for (int i = 0; i < comm_size; ++i) { - if (tx_counts[i] != 0) { - tx_counts[num_tx_dst_ranks] = tx_counts[i]; - tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; - tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; - ++num_tx_dst_ranks; - } - if (rx_counts[i] != 0) { - rx_counts[num_rx_src_ranks] = rx_counts[i]; - rx_offsets[num_rx_src_ranks] = rx_offsets[i]; - rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; - ++num_rx_src_ranks; + if (drop_empty_ranks) { + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = 
rx_src_ranks[i]; + ++num_rx_src_ranks; + } } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); } - tx_counts.resize(num_tx_dst_ranks); - tx_offsets.resize(num_tx_dst_ranks); - tx_dst_ranks.resize(num_tx_dst_ranks); - rx_counts.resize(num_rx_src_ranks); - rx_offsets.resize(num_rx_src_ranks); - rx_src_ranks.resize(num_rx_src_ranks); return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } @@ -823,6 +829,8 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector const& tx_value_counts, rmm::cuda_stream_view stream_view) { + using value_t = typename thrust::iterator_traits::value_type; + auto const comm_size = comm.get_size(); rmm::device_uvector d_tx_value_counts(comm_size, stream_view); @@ -836,11 +844,10 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); - auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = allocate_dataframe_buffer( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -866,6 +873,236 @@ auto shuffle_values(raft::comms::comms_t const& comm, return std::make_tuple(std::move(rx_value_buffer), rx_counts); } +// Add gaps in the receive buffer to enforce that the sent data offset and the received data offset +// have the same alignment for every rank. 
This is faster assuming that @p alignment ensures cache +// line alignment in both send & receive buffer (tested with NCCL 2.23.4) +template +auto shuffle_values( + raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + size_t alignment, // # elements + std::optional::value_type> fill_value, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_size = comm.get_size(); + + std::vector tx_value_displacements(tx_value_counts.size()); + std::exclusive_scan( + tx_value_counts.begin(), tx_value_counts.end(), tx_value_displacements.begin(), size_t{0}); + + std::vector tx_unaligned_counts(comm_size); + std::vector tx_displacements(comm_size); + std::vector tx_aligned_counts(comm_size); + std::vector tx_aligned_displacements(comm_size); + std::vector rx_unaligned_counts(comm_size); + std::vector rx_displacements(comm_size); + std::vector rx_aligned_counts(comm_size); + std::vector rx_aligned_displacements(comm_size); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int{0}); + auto rx_ranks = tx_ranks; + for (size_t i = 0; i < tx_value_counts.size(); ++i) { + tx_unaligned_counts[i] = 0; + if (tx_value_displacements[i] % alignment != 0) { + tx_unaligned_counts[i] = + std::min(alignment - (tx_value_displacements[i] % alignment), tx_value_counts[i]); + } + tx_displacements[i] = tx_value_displacements[i]; + tx_aligned_counts[i] = tx_value_counts[i] - tx_unaligned_counts[i]; + tx_aligned_displacements[i] = tx_value_displacements[i] + tx_unaligned_counts[i]; + } + + rmm::device_uvector d_tx_unaligned_counts(tx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_tx_aligned_counts(tx_aligned_counts.size(), stream_view); + rmm::device_uvector d_rx_unaligned_counts(rx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_rx_aligned_counts(rx_aligned_counts.size(), stream_view); + 
raft::update_device(d_tx_unaligned_counts.data(), + tx_unaligned_counts.data(), + tx_unaligned_counts.size(), + stream_view); + raft::update_device( + d_tx_aligned_counts.data(), tx_aligned_counts.data(), tx_aligned_counts.size(), stream_view); + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + auto rx_counts = tx_counts; + auto rx_offsets = tx_offsets; + cugraph::device_multicast_sendrecv(comm, + d_tx_unaligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_unaligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + d_tx_aligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_aligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + raft::update_host(rx_unaligned_counts.data(), + d_rx_unaligned_counts.data(), + d_rx_unaligned_counts.size(), + stream_view); + raft::update_host( + rx_aligned_counts.data(), d_rx_aligned_counts.data(), d_rx_aligned_counts.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + size_t offset{0}; + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment; + auto cur_alignment = offset % alignment; + if (target_alignment >= cur_alignment) { + offset += target_alignment - cur_alignment; + } else { + offset += (target_alignment + alignment) - cur_alignment; + } + rx_displacements[i] = offset; + rx_aligned_displacements[i] = rx_displacements[i] + rx_unaligned_counts[i]; + offset = rx_aligned_displacements[i] + rx_aligned_counts[i]; + } + + auto rx_values = allocate_dataframe_buffer( + rx_aligned_displacements.back() + rx_aligned_counts.back(), stream_view); + if (fill_value) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + *fill_value); + } + 
cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_unaligned_counts, + tx_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_unaligned_counts, + rx_displacements, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_aligned_counts, + tx_aligned_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_aligned_counts, + rx_aligned_displacements, + rx_ranks, + stream_view); + + return std::make_tuple(std::move(rx_values), + tx_unaligned_counts, + tx_aligned_counts, + tx_displacements, + rx_unaligned_counts, + rx_aligned_counts, + rx_displacements); +} + +// this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size() +// - 1 communication steps +template +auto shuffle_and_unique_segment_sorted_values( + raft::comms::comms_t const& comm, + TxValueIterator + segment_sorted_tx_value_first, // sorted within each segment (segment sizes: + // tx_value_counts[i], where i = [0, comm_size); and bettter be + // unique to reduce communication volume + std::vector const& tx_value_counts, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + auto sorted_unique_values = allocate_dataframe_buffer(0, stream_view); + if (comm_size == 1) { + resize_dataframe_buffer(sorted_unique_values, tx_value_counts[comm_rank], stream_view); + thrust::copy(rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first, + segment_sorted_tx_value_first + tx_value_counts[comm_rank], + get_dataframe_buffer_begin(sorted_unique_values)); + resize_dataframe_buffer( + sorted_unique_values, + thrust::distance(get_dataframe_buffer_begin(sorted_unique_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values))), + stream_view); + } 
else { + rmm::device_uvector d_tx_value_counts(comm_size, stream_view); + raft::update_device( + d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value()); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view); + + d_tx_value_counts.resize(0, stream_view); + d_tx_value_counts.shrink_to_fit(stream_view); + + for (int i = 1; i < comm_size; ++i) { + auto dst = (comm_rank + i) % comm_size; + auto src = + static_cast((static_cast(comm_rank) + static_cast(comm_size - i)) % + static_cast(comm_size)); + auto rx_sorted_values = allocate_dataframe_buffer(rx_counts[src], stream_view); + device_sendrecv(comm, + segment_sorted_tx_value_first + tx_offsets[dst], + tx_counts[dst], + dst, + get_dataframe_buffer_begin(rx_sorted_values), + rx_counts[src], + src, + stream_view); + auto merged_sorted_values = allocate_dataframe_buffer( + (i == 1 ? 
tx_counts[comm_rank] : size_dataframe_buffer(sorted_unique_values)) + + rx_counts[src], + stream_view); + if (i == 1) { + thrust::merge( + rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first + tx_offsets[comm_rank], + segment_sorted_tx_value_first + (tx_offsets[comm_rank] + tx_counts[comm_rank]), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } else { + thrust::merge(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } + resize_dataframe_buffer( + merged_sorted_values, + thrust::distance(get_dataframe_buffer_begin(merged_sorted_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(merged_sorted_values), + get_dataframe_buffer_end(merged_sorted_values))), + stream_view); + sorted_unique_values = std::move(merged_sorted_values); + } + } + shrink_to_fit_dataframe_buffer(sorted_unique_values, stream_view); + return sorted_unique_values; +} + template auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, ValueIterator tx_value_first /* [INOUT */, @@ -889,7 +1126,7 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( @@ -943,7 +1180,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; 
std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index 2c36ed33359..29b9d132ef8 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -64,6 +64,18 @@ size_t sum_thrust_tuple_element_sizes(std::index_sequence) return (... + sizeof(typename thrust::tuple_element::type)); } +template +size_t min_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::min(sizeof(typename thrust::tuple_element::type)...); +} + +template +size_t max_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::max(sizeof(typename thrust::tuple_element::type)...); +} + template auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) { @@ -181,6 +193,20 @@ constexpr size_t sum_thrust_tuple_element_sizes() std::make_index_sequence::value>()); } +template +constexpr size_t min_thrust_tuple_element_sizes() +{ + return detail::min_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + +template +constexpr size_t max_thrust_tuple_element_sizes() +{ + return detail::max_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + template auto thrust_tuple_to_std_tuple(TupleType tup) { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8ae49ed207c..88ef3987a03 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include 
"prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -133,15 +133,15 @@ std::tuple, rmm::device_uvector> brandes_b update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); - auto [new_frontier, new_sigma] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - src_sigmas.view(), - dst_distances.view(), - cugraph::edge_dummy_property_t{}.view(), - brandes_e_op_t{}, - reduce_op::plus()); + auto [new_frontier, new_sigma] = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + src_sigmas.view(), + dst_distances.view(), + cugraph::edge_dummy_property_t{}.view(), + brandes_e_op_t{}, + reduce_op::plus()); update_v_frontier(handle, graph_view, diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index a0ccfa52ffc..869ed4e7ae6 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -243,11 +243,12 @@ std::tuple, weight_t> approximate_weighted_matchin major_comm_size, minor_comm_size}; - candidates_of_candidates = cugraph::collect_values_for_keys(handle, + candidates_of_candidates = cugraph::collect_values_for_keys(comm, target_candidate_map.view(), candidates.begin(), candidates.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { candidates_of_candidates.resize(candidates.size(), handle.get_stream()); diff --git 
a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index e17abdb3703..18fb3fdb251 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -289,11 +289,12 @@ rmm::device_uvector update_clustering_by_delta_modularity( invalid_vertex_id::value, std::numeric_limits::max(), handle.get_stream()); - vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle, + vertex_cluster_weights_v = cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), next_clusters_v.begin(), next_clusters_v.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); src_cluster_weights = edge_src_property_t, weight_t>(handle, diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 272e3d71f83..d11e38dbf9d 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -182,11 +182,12 @@ refine_clustering( comm_size, major_comm_size, minor_comm_size}; vertex_louvain_cluster_weights = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), louvain_assignment_of_vertices.begin(), louvain_assignment_of_vertices.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { vertex_louvain_cluster_weights.resize(louvain_assignment_of_vertices.size(), @@ -468,11 +469,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; louvain_of_leiden_keys_used_in_edge_reduction = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_used_in_edge_reduction.begin(), leiden_keys_used_in_edge_reduction.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { louvain_of_leiden_keys_used_in_edge_reduction.resize( leiden_keys_used_in_edge_reduction.size(), handle.get_stream()); @@ -859,11 +861,12 @@ 
refine_clustering( // comm_size, major_comm_size, minor_comm_size}; lovain_of_leiden_cluster_keys = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_to_read_louvain.begin(), leiden_keys_to_read_louvain.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream()); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 468f4f7280f..219bc3c4d1d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -550,24 +550,25 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream()); resize_dataframe_buffer(edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); - auto new_frontier_tagged_vertex_buffer = transform_reduce_v_frontier_outgoing_e_by_dst( - handle, - level_graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{ - GraphViewType::is_multi_gpu - ? 
detail::edge_partition_endpoint_property_device_view_t( - edge_dst_components.mutable_view()) - : detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(level_components, - vertex_t{0})), - level_graph_view.local_edge_partition_dst_range_first(), - get_dataframe_buffer_begin(edge_buffer), - num_edge_inserts.data()}, - reduce_op::null()); + auto new_frontier_tagged_vertex_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + level_graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{ + GraphViewType::is_multi_gpu + ? detail::edge_partition_endpoint_property_device_view_t( + edge_dst_components.mutable_view()) + : detail::edge_partition_endpoint_property_device_view_t( + detail::edge_minor_property_view_t(level_components, + vertex_t{0})), + level_graph_view.local_edge_partition_dst_range_first(), + get_dataframe_buffer_begin(edge_buffer), + num_edge_inserts.data()}, + reduce_op::null()); update_v_frontier(handle, level_graph_view, diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index d807ccac5a5..a2b6f6430f0 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -222,14 +222,15 @@ void core_number(raft::handle_t const& handle, if (graph_view.is_symmetric() || ((degree_type == k_core_degree_type_t::IN) || (degree_type == k_core_degree_type_t::INOUT))) { auto [new_frontier_vertex_buffer, delta_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - 
vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - dst_core_numbers.view(), - edge_dummy_property_t{}.view(), - e_op_t{k, delta}, - reduce_op::plus()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + dst_core_numbers.view(), + edge_dummy_property_t{}.view(), + e_op_t{k, delta}, + reduce_op::plus()); update_v_frontier( handle, diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 1c8c39fd6dd..45bbf870d80 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -115,12 +115,13 @@ struct lookup_container_t::lookup_con auto const minor_comm_size = minor_comm.get_size(); value_buffer = cugraph::collect_values_for_keys( - handle, + comm, kv_store_object->view(), edge_ids_to_lookup.begin(), edge_ids_to_lookup.end(), cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); } else { cugraph::resize_dataframe_buffer( value_buffer, edge_ids_to_lookup.size(), handle.get_stream()); diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 177c79ace87..6cc410c0c8a 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -15,9 +15,11 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -72,9 +74,9 @@ __device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_fi e_op_result_t e_op_result) { using output_key_t = - typename 
optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; using output_value_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; assert(e_op_result.has_value()); @@ -118,7 +120,6 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - int32_t constexpr shared_array_size = max_one_e_per_frontier_key - ? int32_t{1} /* dummy */ - : extract_transform_v_frontier_e_kernel_block_size; - __shared__ std::conditional_t - warp_local_degree_inclusive_sums[shared_array_size]; - __shared__ std::conditional_t - warp_key_local_edge_offsets[shared_array_size]; + __shared__ edge_t + warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; + __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; using WarpScan = cub::WarpScan; - __shared__ std:: - conditional_t - temp_storage; + __shared__ typename WarpScan::TempStorage temp_storage; auto indices = edge_partition.indices(); @@ -216,98 +211,74 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } - if constexpr (max_one_e_per_frontier_key) { - // each thread processes one frontier key, exits if any edge returns a valid output + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive - e_op_result_t e_op_result{thrust::nullopt}; - auto key = *(key_first + idx); + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - if (edge_partition_e_mask) { - for (edge_t i = 0; i < local_degree; ++i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - } else { - for (edge_t i = 0; i < local_degree; ++i) { - e_op_result = 
call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } else { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - warp_key_local_edge_offsets[threadIdx.x] = edge_offset; - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) - auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = - this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto 
this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - } + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { auto key = *(key_first + (min_key_idx + key_idx_this_warp)); e_op_result = call_e_op(key, local_edge_offset); } + } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -315,8 +286,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); - using WarpReduce = cub::WarpReduce; - __shared__ std::conditional_t - temp_storage[max_one_e_per_frontier_key - ? (extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = - ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); auto call_e_op = call_e_op_t(local_out_degree)) && + if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -446,8 +382,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template edge_partition, KeyIterator key_first, - KeyIterator key_last, + raft::device_span key_local_degree_offsets, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -482,132 +417,249 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto const warp_id = threadIdx.x / raft::warp_size(); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const lane_id = threadIdx.x % raft::warp_size(); - auto idx = static_cast(blockIdx.x); - - cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - using BlockReduce = cub::BlockReduce; - __shared__ std::conditional_t - temp_storage; - __shared__ int32_t output_thread_id; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t 
local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = - edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + - (extract_transform_v_frontier_e_kernel_block_size - 1)) / - extract_transform_v_frontier_e_kernel_block_size) * - extract_transform_v_frontier_e_kernel_block_size; - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - local_edge_offset}; + auto idx = static_cast(tid); - if (edge_partition_e_mask) { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if ((i < static_cast(local_out_degree)) && - ((*edge_partition_e_mask).get(local_edge_offset + i))) { - e_op_result = call_e_op(i); - } + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? 
threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } - } else { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + auto num_edges = *(key_local_degree_offsets.rbegin()); + size_t rounded_up_num_edges = + ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + while (idx < rounded_up_num_edges) { + e_op_result_t e_op_result{thrust::nullopt}; + if (idx < num_edges) { + auto key_idx = thrust::distance( + key_local_degree_offsets.begin() + 1, + thrust::upper_bound( + 
thrust::seq, key_local_degree_offsets.begin() + 1, key_local_degree_offsets.end(), idx)); + auto key = *(key_first + key_idx); + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + auto e_idx = static_cast(idx - key_local_degree_offsets[key_idx]); + if (edge_partition_e_mask) { + if ((*edge_partition_e_mask).get(local_edge_offset + e_idx)) { + e_op_result = call_e_op(e_idx); } + } else { + e_op_result = call_e_op(e_idx); } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + + idx += gridDim.x * blockDim.x; + } +} + +template +void extract_transform_v_frontier_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + InputKeyIterator edge_partition_frontier_key_first, + InputKeyIterator edge_partition_frontier_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + OptionalOutputKeyIterator output_key_first, + OptionalOutputValueIterator output_value_first, + raft::device_span count /* size = 1 */, + EdgeOp e_op, + std::optional> high_segment_key_local_degree_offsets, + std::optional high_segment_edge_count, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + 
stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_count), + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + raft::device_span((*high_segment_key_local_degree_offsets).data(), + (*high_segment_key_local_degree_offsets).size()), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + auto frontier_size = static_cast( + thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); + if (frontier_size > 0) { + raft::grid_1d_thread_t update_grid(frontier_size, + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); - idx += gridDim.x; + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } } } +#define EXTRACT_PERFORMANCE_MEASUREMENT 0 // FIXME: delete + template -std::tuple< - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})), - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple, + optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, EdgeOp e_op, bool do_expensive_check = false) { +#if EXTRACT_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time0 = std::chrono::steady_clock::now(); +#endif using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using output_key_t = OutputKeyT; using output_value_t = OutputValueT; @@ -653,6 +705,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, 
thrust::optional>>>); + constexpr bool try_bitmap = GraphViewType::is_multi_gpu && std::is_same_v && + KeyBucketType::is_sorted_unique; + if (do_expensive_check) { auto frontier_vertex_first = thrust_tuple_get_or_identity(frontier.begin()); @@ -673,10 +728,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, "Invalid input argument: frontier includes out-of-range keys."); } + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. pre-process frontier data + auto frontier_key_first = frontier.begin(); auto frontier_key_last = frontier.end(); auto frontier_keys = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (!VertexFrontierBucketType::is_sorted_unique) { + if constexpr (!KeyBucketType::is_sorted_unique) { resize_dataframe_buffer(frontier_keys, frontier.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), frontier_key_first, @@ -689,209 +749,726 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - // 1. 
fill the buffers + std::optional> key_segment_offsets{std::nullopt}; + { // drop zero degree vertices & compute key_segment_offsets + size_t partition_idx{0}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + partition_idx = static_cast(minor_comm.get_rank()); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + if (thrust::distance(frontier_key_first, frontier_key_last) > 0) { + key_segment_offsets = compute_key_segment_offsets( + frontier_key_first, + frontier_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + } else { + key_segment_offsets = std::vector((*segment_offsets).size(), 0); + } + } + } + + // 2. 
compute local max_pushes - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + size_t local_max_pushes{}; + { + size_t partition_idx{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto frontier_major_first = + thrust_tuple_get_or_identity(frontier_key_first); + auto frontier_major_last = + thrust_tuple_get_or_identity(frontier_key_last); + // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of + // additional computing) + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_major_first, frontier_major_last, handle.get_stream()); + } + + // 3. 
communication over minor_comm std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_lasts{}; + std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{}; + { + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + approx_tmp_buffer_size_per_loop = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)) * key_size + + local_max_pushes * (output_key_size + output_value_size); + } + + size_t num_scalars = + 3; // local_frontier_size, max_tmp_buffer_size, 
approx_tmp_buffer_size_per_loop + if constexpr (try_bitmap) { + num_scalars += 2; // local_frontier_range_first, local_frontier_range_last + } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + (num_scalars * minor_comm_rank + (try_bitmap ? 5 : 3)), + [frontier_key_first, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + return max_tmp_buffer_size; + } else if (i == 2) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *frontier_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + assert(i == 4); + vertex_t last{}; + if (v_list_size > 0) { + last = *(frontier_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } + assert(false); + return size_t{0}; + }); + if (key_segment_offsets) { + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 
5 : 3)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_frontier_sizes = std::vector(minor_comm_size); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_frontier_range_firsts = std::vector(minor_comm_size); + local_frontier_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars + 1]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_frontier_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_frontier_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 5 : 3)), + h_aggregate_tmps.begin() + + (i * num_scalars + (try_bitmap ? 
5 : 3) + (*key_segment_offsets).size())); + } + } } else { local_frontier_sizes = std::vector{static_cast( static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + + // update frontier bitmap (used to reduce broadcast bandwidth size) + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_frontier{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_frontier_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + local_frontier_max_range_size = std::max(range_size, local_frontier_max_range_size); + } + if (local_frontier_max_range_size <= + std::numeric_limits::max()) { // broadcast 32 bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_frontier_sizes[i]); + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? 
(num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + auto avg_frontier_size = + std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_frontier_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + local_frontier_range_firsts[minor_comm_rank], + local_frontier_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_frontier_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_frontier = std::move(tmps); + } + } + } + + // set-up stream ppol + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (graph_view.local_vertex_partition_segment_offsets() && + (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / 
graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. + } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loopos streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } + rmm::device_uvector counters(num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 2. fill the buffers + + std::vector> key_buffers{}; + std::vector> value_buffers{}; + key_buffers.reserve(graph_view.number_of_local_edge_partitions()); + value_buffers.reserve(graph_view.number_of_local_edge_partitions()); + auto edge_mask_view = graph_view.edge_mask_view(); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; +#if EXTRACT_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); +#endif + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { +#if EXTRACT_PERFORMANCE_MEASUREMENT + auto subtime0 = std::chrono::steady_clock::now(); +#endif + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{}; + if constexpr (try_bitmap) { + if (frontier_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); - - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); - - edge_partition_frontier_key_first = - 
get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); - } - - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto max_pushes = max_one_e_per_frontier_key ? edge_partition_frontier_size - : edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - handle.get_stream()); - - auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; - resize_optional_dataframe_buffer( - key_buffer, new_buffer_size, handle.get_stream()); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_frontier_range_lasts[partition_idx] - + local_frontier_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if 
(!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_frontier_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_frontier_sizes[partition_idx], handle.get_stream())); + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + if constexpr (try_bitmap) { + if (frontier_bitmap) { + device_bcast(minor_comm, + (*frontier_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_frontier) { + device_bcast(minor_comm, + (*compressed_frontier).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize(local_frontier_sizes[partition_idx], loop_stream); + } else { + keys = + rmm::device_uvector(local_frontier_sizes[partition_idx], loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + + auto range_first = local_frontier_range_firsts[partition_idx]; + auto range_last = local_frontier_range_lasts[partition_idx]; + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + range_first, + range_last, + loop_stream); + } + + edge_partition_key_buffers.push_back(std::move(keys)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + (*edge_partition_bitmap_buffers).clear(); + } + } + } +#if EXTRACT_PERFORMANCE_MEASUREMENT + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + auto subtime1 = std::chrono::steady_clock::now(); +#endif + + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector edge_partition_max_push_counts(loop_count); + + std::optional>> + high_segment_key_local_degree_offset_vectors{std::nullopt}; + std::optional> high_segment_edge_counts{std::nullopt}; + if (key_segment_offset_vectors) { + 
high_segment_key_local_degree_offset_vectors = std::vector>{}; + (*high_segment_key_local_degree_offset_vectors).reserve(loop_count); + high_segment_edge_counts = std::vector(loop_count); } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - if (segment_offsets) { - static_assert(num_sparse_segments_per_vertex_partition == 3); - std::vector h_thresholds(num_sparse_segments_per_vertex_partition + - (graph_view.use_dcs() ? 1 : 0) - 1); - h_thresholds[0] = edge_partition.major_range_first() + (*segment_offsets)[1]; - h_thresholds[1] = edge_partition.major_range_first() + (*segment_offsets)[2]; - if (graph_view.use_dcs()) { - h_thresholds[2] = edge_partition.major_range_first() + (*segment_offsets)[3]; + edge_partition_max_push_counts[0] = local_max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& keys = edge_partition_key_buffers[j]; + + bool computed{false}; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + auto major_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + return range_first + static_cast(v_offset); + })); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + std::get<0>(keys).size(), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + computed = true; + } + } + if (!computed) { + dataframe_buffer_const_iterator_type_t key_first{}; + size_t num_keys{}; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + num_keys = std::get<1>(keys).size(); + } else { + key_first = get_dataframe_buffer_begin(keys); + num_keys = size_dataframe_buffer(keys); + } + auto major_first = thrust_tuple_get_or_identity(key_first); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + num_keys, + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + raft::update_host( + edge_partition_max_push_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + edge_partition_max_push_counts[minor_comm_rank % num_concurrent_loops] = local_max_pushes; + } } - rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); - raft::update_device( - d_thresholds.data(), 
h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - d_thresholds.begin(), - d_thresholds.end(), - d_offsets.begin()); - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); - RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - h_offsets.push_back(edge_partition_frontier_size); - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one - // more segment for very high degree vertices and running segmented reduction - if (h_offsets[0] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } +#if EXTRACT_PERFORMANCE_MEASUREMENT + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + auto subtime2 = std::chrono::steady_clock::now(); +#endif + + if (key_segment_offset_vectors) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector high_segment_key_local_degree_offsets( + key_segment_offsets[1] + 1, loop_stream); + high_segment_key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto key_local_degree_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [edge_partition, + range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + auto major = range_first + static_cast(v_offset); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + computed = true; + } + } + if (!computed) { + auto key_first = frontier_key_first; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + } else { + key_first = get_dataframe_buffer_begin(keys); + } + auto key_local_degree_first = thrust::make_transform_iterator( + key_first, cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + 
thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + } + } + raft::update_host((*high_segment_edge_counts).data() + j, + high_segment_key_local_degree_offsets.data() + key_segment_offsets[1], + 1, + loop_stream); + (*high_segment_key_local_degree_offset_vectors) + .push_back(std::move(high_segment_key_local_degree_offsets)); } - if (h_offsets[1] - h_offsets[0] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + + // to ensure that *high_segment_edge_counts[] is valid + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } else { + handle.sync_stream(); } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + for (size_t j = 0; j < loop_count; ++j) 
{ + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + output_key_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + output_value_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT + auto subtime3 = std::chrono::steady_clock::now(); +#endif + + thrust::fill( + handle.get_thrust_policy(), counters.begin(), counters.begin() + loop_count, size_t{0}); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - 
buffer_idx.data(), - e_op); + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); } - } else { - if (edge_partition_frontier_size > 0) { - raft::grid_1d_thread_t update_grid(edge_partition_frontier_size, - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_frontier_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + auto edge_partition_frontier_key_last = + edge_partition_frontier_key_first + std::get<0>(keys).size(); + extract_transform_v_frontier_e_edge_partition( + handle, edge_partition, edge_partition_frontier_key_first, edge_partition_frontier_key_last, @@ -899,24 +1476,183 @@ 
extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + computed = true; + } } - } - } + if (!computed) { + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + edge_partition_frontier_key_first = std::get<1>(keys).begin(); + edge_partition_frontier_key_last = std::get<1>(keys).end(); + } else { + edge_partition_frontier_key_first = get_dataframe_buffer_begin(keys); + edge_partition_frontier_key_last = get_dataframe_buffer_end(keys); + } + } - // 2. 
resize and return the buffers + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + } + } - auto new_buffer_size = buffer_idx.value(handle.get_stream()); + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT + auto subtime4 = std::chrono::steady_clock::now(); +#endif + + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); +#if EXTRACT_PERFORMANCE_MEASUREMENT + auto subtime5 = std::chrono::steady_clock::now(); +#endif + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto tmp_buffer_size = h_counts[j]; + if (tmp_buffer_size > 0) { + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + + resize_optional_dataframe_buffer( + tmp_key_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT + auto subtime6 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," + << subdur5.count() << ") loop_count=" << loop_count << std::endl; +#endif + } +#if EXTRACT_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif + + // 3. 
concatenate and return the buffers + + auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + if (key_buffers.size() == 0) { + /* nothing to do */ + } else if (key_buffers.size() == 1) { + key_buffer = std::move(key_buffers[0]); + value_buffer = std::move(value_buffers[0]); + shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + } else { + std::vector buffer_sizes(key_buffers.size()); + static_assert(!std::is_same_v || !std::is_same_v); + for (size_t i = 0; i < key_buffers.size(); ++i) { + if constexpr (!std::is_same_v) { + buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]); + } else { + buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]); + } + } + auto buffer_size = std::reduce(buffer_sizes.begin(), buffer_sizes.end()); + resize_optional_dataframe_buffer(key_buffer, buffer_size, handle.get_stream()); + resize_optional_dataframe_buffer( + value_buffer, buffer_size, handle.get_stream()); + std::vector buffer_displacements(buffer_sizes.size()); + std::exclusive_scan( + buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); + handle.sync_stream(); + for (size_t i = 0; i < key_buffers.size(); ++i) { + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[i]) + : handle.get_stream(); + if constexpr (!std::is_same_v) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(key_buffers[i]), + get_optional_dataframe_buffer_cend(key_buffers[i]), + get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]); + } - resize_optional_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + if constexpr (!std::is_same_v) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(value_buffers[i]), + get_optional_dataframe_buffer_cend(value_buffers[i]), + get_optional_dataframe_buffer_begin(value_buffer) + + buffer_displacements[i]); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); +#if EXTRACT_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time3 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::cerr << "\t\t" + << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << ")" << std::endl; +#endif return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); } diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh new file mode 100644 index 00000000000..76ef3fb0de4 --- /dev/null +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +namespace detail { + +inline std::vector init_stream_pool_indices(size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_loop, + size_t loop_count, + size_t num_streams_per_loop, + size_t max_streams) +{ + size_t num_streams = std::min(loop_count * num_streams_per_loop, + raft::round_down_safe(max_streams, num_streams_per_loop)); + + auto num_concurrent_loops = + (approx_tmp_buffer_size_per_loop > 0) + ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_loop, size_t{1}) + : loop_count; + num_streams = std::min(num_concurrent_loops * num_streams_per_loop, num_streams); + + std::vector stream_pool_indices(num_streams); + std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); + + return stream_pool_indices; +} + +// this assumes that the caller already knows how many items will be copied. 
+template +void copy_if_nosync(InputIterator input_first, + InputIterator input_last, + FlagIterator flag_first, + OutputIterator output_first, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::copy_if_nosync relies on cub::DeviceSelect::Flagged which uses int for input " + "size, but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceSelect::Flagged(static_cast(nullptr), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceSelect::Flagged(d_tmp_storage.data(), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); +} + +template +void count_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type value, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, 
count.data(), input_size, stream_view); +} + +template +void sum_nosync( + InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + sum.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, sum.data(), input_size, stream_view); +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/optional_dataframe_buffer.hpp b/cpp/src/prims/detail/optional_dataframe_buffer.hpp index 87c095f8e81..6657b91f13b 100644 --- a/cpp/src/prims/detail/optional_dataframe_buffer.hpp +++ b/cpp/src/prims/detail/optional_dataframe_buffer.hpp @@ -26,152 +26,130 @@ namespace detail { // we cannot use thrust::iterator_traits::value_type if Iterator is void* (reference to // void is not allowed) template -struct optional_dataframe_buffer_value_type_t; +struct optional_dataframe_buffer_iterator_value_type_t; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = typename thrust::iterator_traits::value_type; }; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = void; }; -template >* = nullptr> -std::byte 
allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) -{ - return std::byte{0}; // dummy -} - -template >* = nullptr> +template auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) { - return allocate_dataframe_buffer(size, stream); + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } } -template >* = nullptr> -void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} +template +struct optional_dataframe_buffer_type { + using type = decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; -template >* = nullptr> -auto get_optional_dataframe_buffer_begin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_begin(optional_dataframe_buffer); -} +template +using optional_dataframe_buffer_type_t = typename optional_dataframe_buffer_type::type; -template >* = nullptr> -void* get_optional_dataframe_buffer_end(std::byte& optional_dataframe_buffer) +template +auto get_optional_dataframe_buffer_begin( + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return static_cast(nullptr); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_dataframe_buffer); + } } -template >* = nullptr> +template auto get_optional_dataframe_buffer_end( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return get_dataframe_buffer_end(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_end(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cbegin(std::byte const& optional_dataframe_buffer) -{ - return 
static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cbegin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cend(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cend( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_cend(optional_dataframe_buffer); -} - -template >* = nullptr> -void reserve_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_capacity, - rmm::cuda_stream_view stream_view) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return; + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cend(optional_dataframe_buffer); + } } -template >* = nullptr> +template void reserve_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_capacity, rmm::cuda_stream_view stream_view) { - return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); -} - -template >* = nullptr> -void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_size, - rmm::cuda_stream_view stream_view) -{ - return; + if constexpr (std::is_same_v) { + return; + } else { + return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); + } } -template >* = 
nullptr> +template void resize_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) { - return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + if constexpr (std::is_same_v) { + return; + } else { + return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + } } -template >* = nullptr> -void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return; -} - -template >* = nullptr> +template void shrink_to_fit_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); -} - -template >* = nullptr> -size_t size_optional_dataframe_buffer(std::byte const& optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer, rmm::cuda_stream_view stream_view) { - return size_t{0}; + if constexpr (std::is_same_v) { + return; + } else { + return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); + } } -template >* = nullptr> +template size_t size_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return size_dataframe_buffer(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return size_t{0}; + } else { + return size_dataframe_buffer(optional_dataframe_buffer); + } } } // namespace detail diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh new file mode 100644 index 00000000000..c15ce02a985 --- /dev/null +++ 
b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -0,0 +1,4490 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/multi_stream_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" +#include "prims/detail/prim_functors.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size = 128; + +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + 
Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + +template +__device__ auto init_pred_op( + edge_partition_device_view_t const& edge_partition, + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input, + PredOp const& pred_op, + key_t key, + typename GraphViewType::vertex_type major_offset, + typename GraphViewType::vertex_type const* indices, + typename GraphViewType::edge_type edge_offset) +{ + if constexpr (std::is_same_v< + PredOp, + const_true_e_op_t>) { + return call_const_true_e_op_t{}; + } else { + return call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } +} + +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + PredOp const& pred_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + if (pred_op(i)) { + auto e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + PredOp const& pred_op, + size_t output_idx /* relevent only when update_major === true */, + 
ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + output_idx) = val; + } else { + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); + } +} + +template +__global__ static void per_v_transform_reduce_e_hypersparse( + edge_partition_device_view_t edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + 
static_assert(update_major || !use_input_key); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + size_t key_count{}; + if constexpr (use_input_key) { + key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } + + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (use_input_key) { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } + + size_t output_idx = use_input_key ? 
idx : (major - *(edge_partition).major_hypersparse_first()); + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + output_idx, + result_value_output); + } + } else { + if constexpr (update_major) { *(result_value_output + output_idx) = init; } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_low_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + 
ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(major_offset)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + idx, + result_value_output); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_mid_degree( + 
edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? 
(per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { + reduced_e_op_result = + (lane_id == 0) ? init : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ static void per_v_transform_reduce_e_high_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + 
static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto idx = static_cast(blockIdx.x); + + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + std::is_same_v> + ? per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; + [[maybe_unused]] __shared__ + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 + ? 
init + : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if 
constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) + ? 
0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x; + } +} + +template +void compute_priorities( + raft::comms::comms_t const& comm, + ValueIterator value_first, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX + // node should be the next) + + if (ignore_local_values) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + } else { + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.begin() + contiguous_size, + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? 
rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + if (hypersparse_key_offsets) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin() + contiguous_size, + priorities.end(), + std::numeric_limits::max()); + if ((*hypersparse_key_offsets).index() == 0) { + auto priority_first = thrust::make_transform_iterator( + std::get<0>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(uint32_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<0>(*hypersparse_key_offsets).size(), + std::get<0>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } else { + auto priority_first = thrust::make_transform_iterator( + std::get<1>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(size_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<1>(*hypersparse_key_offsets).size(), + std::get<1>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } + } + } +} + +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indicating whether this rank's values are +// selected or not. 
+template +std::variant, + int, + priority_t>> /* root, store selected ranks */, + std::optional> /* store bitmap */> +compute_selected_ranks_from_priorities( + raft::comms::comms_t const& comm, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + using rank_t = std::conditional_t, int, priority_t>; + + if (comm_rank == root) { + rmm::device_uvector selected_ranks(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + selected_ranks.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + return static_cast(rank); + }); + return selected_ranks; + } else { + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector( + packed_bool_size(hypersparse_key_offsets + ? (contiguous_size + ((*hypersparse_key_offsets).index() == 0 + ? 
std::get<0>(*hypersparse_key_offsets).size() + : std::get<1>(*hypersparse_key_offsets).size())) + : contiguous_size), + stream_view); + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + packed_bool_empty_mask()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + contiguous_size, + [keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if (hypersparse_key_offsets) { + if ((*hypersparse_key_offsets).index() == 0) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<0>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<0>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? 
comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } else { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<1>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<1>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } + } + } + return keep_flags; + } +} + +template +void per_v_transform_reduce_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + OptionalKeyIterator edge_partition_key_first, + OptionalKeyIterator edge_partition_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper output_buffer, + EdgeOp e_op, + T major_init, + T major_identity_element, + ReduceOp reduce_op, + PredOp pred_op, + std::optional> key_segment_offsets, + 
std::optional> const& edge_partition_stream_pool_indices) +{ + constexpr bool use_input_key = !std::is_same_v; + + using vertex_t = typename GraphViewType::vertex_type; + using segment_key_iterator_t = + std::conditional_t; + + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy_nosync(exec_stream), + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], + major_init); + } + + auto segment_size = use_input_key + ? 
((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last = + segment_key_first + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]); + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*key_segment_offsets)[1], + std::is_same_v> + ? 
detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (num_keys > size_t{0}) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first 
+ num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } +} + +#define PER_V_PERFORMANCE_MEASUREMENT 0 // FIXME: delete performance logging code + +template +void per_v_transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first) +{ +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time0 = std::chrono::steady_clock::now(); +#endif + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || !use_input_key); + constexpr bool filter_input_key = + GraphViewType::is_multi_gpu && use_input_key && + std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && + // std::is_same_v>, for any + // vertex in the frontier, we need to visit only local edges + // if we find any valid local edge (FIXME: this is + // applicable even when use_input_key is false). 
+ + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + using edge_partition_src_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeSrcValueInputWrapper::value_iterator, + typename EdgeSrcValueInputWrapper::value_type>>; + using edge_partition_dst_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeDstValueInputWrapper::value_iterator, + typename EdgeDstValueInputWrapper::value_type>>; + using edge_partition_e_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_edge_dummy_property_device_view_t, + detail::edge_partition_edge_property_device_view_t< + edge_t, + typename EdgeValueInputWrapper::value_iterator, + typename EdgeValueInputWrapper::value_type>>; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + constexpr bool try_bitmap = + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. 
drop zero degree keys & compute key_segment_offsets + + auto local_vertex_partition_segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + if (local_vertex_partition_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + sorted_unique_nzd_key_last = sorted_unique_key_first + (*key_segment_offsets).back(); + } + } + + // 2. initialize vertex value output buffer + + if constexpr (update_major) { // no vertices in the zero degree segment are visited (otherwise, + // no need to initialize) + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + if (local_vertex_partition_segment_offsets) { + thrust::fill( + handle.get_thrust_policy(), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + /* no need to initialize (we use minor_tmp_buffer) */ + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + // 3. 
filter input keys & update key_segment_offsets +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); +#endif + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto tmp_key_buffer = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + auto tmp_output_indices = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + std::conditional_t, + VertexValueOutputIterator> + tmp_vertex_value_output_first{}; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, static_cast(minor_comm_rank)) + : thrust::nullopt; + + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } + + if (edge_partition_stream_pool_indices) { handle.sync_stream(); } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t( + edge_dst_value_input, static_cast(minor_comm_rank)); + } else { + edge_partition_src_value_input = edge_partition_src_input_device_view_t( + edge_src_value_input, 
static_cast(minor_comm_rank)); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + vertex_value_output_first, + e_op, + init, + init, + reduce_op, + pred_op, + key_segment_offsets ? std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices ? std::make_optional>( + (*edge_partition_stream_pool_indices).data(), + (*edge_partition_stream_pool_indices).size()) + : std::nullopt); + + if (edge_partition_stream_pool_indices) { + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + } + + auto num_tmp_keys = thrust::count( + handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + init); // we allow false positives (some edge operations may actually return init) + + resize_optional_dataframe_buffer(tmp_key_buffer, num_tmp_keys, handle.get_stream()); + resize_optional_dataframe_buffer(tmp_output_indices, num_tmp_keys, handle.get_stream()); + + auto input_first = + thrust::make_zip_iterator(sorted_unique_key_first, thrust::make_counting_iterator(size_t{0})); + thrust::copy_if( + handle.get_thrust_policy(), + input_first, + input_first + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first, + thrust::make_zip_iterator(get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_output_indices)), + is_equal_t{init}); + + sorted_unique_key_first = 
get_optional_dataframe_buffer_begin(tmp_key_buffer); + sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); + tmp_vertex_value_output_first = thrust::make_permutation_iterator( + vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + + if (key_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + assert((*key_segment_offsets).back() == *((*key_segment_offsets).rbegin() + 1)); + assert(sorted_uniue_nzd_key_last == sorted_unique_key_first + (*key_segment_offsets).back()); + } + } else { + tmp_vertex_value_output_first = vertex_value_output_first; + } + + /* 4. compute subgroup_size (used to compute priority in device_gatherv) */ +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = minor_comm_size; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / major_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + } + + // 5. 
collect max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, local_key_list_sizes, + // local_v_list_range_firsts, local_v_list_range_lasts, local_key_list_deg1_sizes, + // key_segment_offset_vectors + + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_lasts{}; + std::conditional_t>, std::byte /* dummy */> + local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected + std::conditional_t>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{0}; + if constexpr (update_major) { + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } + + size_t major_range_size{}; + if constexpr (use_input_key) { + major_range_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + ; + } else { + major_range_size = graph_view.local_vertex_partition_range_size(); + } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be 
filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } + approx_tmp_buffer_size_per_loop = major_range_size * size_per_key; + } + + size_t num_scalars = 2; // max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + size_t num_scalars_less_key_segment_offsets = num_scalars; + if constexpr (use_input_key) { + num_scalars += 1; // local_key_list_size + if constexpr (try_bitmap) { + num_scalars += 2; // local_key_list_range_first, local_key_list_range_last + } + if (filter_input_key && graph_view.use_dcs()) { + num_scalars += 1; // local_key_list_degree_1_size + } + num_scalars_less_key_segment_offsets = num_scalars; + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + } + + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets, + [max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? 
thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (use_input_key) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (i == 2) { return v_list_size; } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } else if (i == 5) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } else { + if (i == 3) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + 
deg1_v_first))); + } + } + } + } + assert(false); + return size_t{0}; + }); + if constexpr (use_input_key) { + if (key_segment_offsets) { + raft::update_device(d_aggregate_tmps.data() + (num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (use_input_key) { + local_key_list_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_v_list_range_firsts = std::vector(minor_comm_size); + local_v_list_range_lasts = std::vector(minor_comm_size); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + local_key_list_deg1_sizes = std::vector(minor_comm_size); + } + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + } + for (int i = 0; i < minor_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 1]; + if constexpr (use_input_key) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_v_list_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_v_list_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + 
(*local_key_list_deg1_sizes)[i] = + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 5 : 3)]); + } + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back( + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets, + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets + + (*key_segment_offsets).size()); + } + } + } + } else { + if constexpr (use_input_key) { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + } + + // 6. compute optional bitmap info & compressed vertex list + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + v_list_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_v_list{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_key_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? 
(num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + double threshold_ratio = + 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / + static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_key_list_size = + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_key_list_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to considerr additional kernel launch overhead */)) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_v_list_range_firsts[minor_comm_rank], + local_v_list_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_key_first, + sorted_unique_nzd_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + } + + bool uint32_key_output_offset = false; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + size_t max_key_offset_size = std::numeric_limits::max(); + if constexpr (filter_input_key) { + max_key_offset_size = std::reduce( + local_key_list_sizes.begin(), local_key_list_sizes.end(), size_t{0}, [](auto l, auto r) { + return std::max(l, r); + }); + } else { + static_assert(!use_input_key); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + auto 
output_range_size = + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + + max_key_offset_size = std::max(static_cast(output_range_size), max_key_offset_size); + } + } + uint32_key_output_offset = + (max_key_offset_size <= static_cast(std::numeric_limits::max())); + } + + // 7. set-up stream pool & events + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. 
+ } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); + } + + // 8. set-up temporary buffers + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer ma not + // store values for the entire minor rangey + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? 
init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + + auto counters = allocate_optional_dataframe_buffer< + std::conditional_t>( + num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 9. process local edge partitions + +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time3 = std::chrono::steady_clock::now(); +#endif + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime0 = std::chrono::steady_clock::now(); + auto subtime1 = std::chrono::steady_clock::now(); + auto subtime2 = std::chrono::steady_clock::now(); +#endif + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); +#if PER_V_PERFORMANCE_MEASUREMENT + std::vector bcast_sizes(loop_count); +#endif + + std::conditional_t< + GraphViewType::is_multi_gpu && use_input_key, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; + std::conditional_t, rmm::device_uvector>>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in th + // hypersparse regione + std::conditional_t>, std::byte /* dummy */> + edge_partition_deg1_hypersparse_key_offset_counts{}; + std::vector process_local_edges(loop_count, true); + + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); 
+ + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; +#if PER_V_PERFORMANCE_MEASUREMENT + bcast_sizes[j] = packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]) * + sizeof(uint32_t); +#endif + } +#if PER_V_PERFORMANCE_MEASUREMENT + else { + bcast_sizes[j] = local_key_list_sizes[partition_idx] * + (v_compressible ? 
sizeof(uint32_t) : sizeof(vertex_t)); + } +#endif + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_key_list_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); + } + } + + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { + process_local_edges[j] = false; + } + } + } +#if PER_V_PERFORMANCE_MEASUREMENT + handle.sync_stream(); + subtime1 = std::chrono::steady_clock::now(); +#endif + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT + subtime2 = 
std::chrono::steady_clock::now(); +#endif + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + // copy keys from temporary bitmap buffers to key buffers (copy only the sparse segments + // if filter_input_key is true) + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } else { + keys = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + if (process_local_edges[j]) { + auto range_first = local_v_list_range_firsts[partition_idx]; + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); + } + } + if (range_first < range_last) { + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. 
+ // (*key_segment_offset_vectors)[partition_idx][3]) + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + range_first, + range_last, + loop_stream); + } + } + } else { + rx_bitmap.resize(0, loop_stream); + rx_bitmap.shrink_to_fit(loop_stream); + } + edge_partition_key_buffers.push_back(std::move(keys)); + } + } + } + + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + edge_partition_hypersparse_key_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count, 0); + + std::conditional_t, + rmm::device_uvector>>, + std::vector>>>, + std::byte /* dummy */> + edge_partition_new_key_buffers{}; + bool allocate_new_key_buffer{true}; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } + } + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + // keys to the new key buffers + if constexpr (try_bitmap) { + edge_partition_new_key_buffers = std::vector< + std::variant, rmm::device_uvector>>{}; + } else { + edge_partition_new_key_buffers = std::vector>{}; + } + (*edge_partition_new_key_buffers).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if constexpr (try_bitmap) { + if (v_compressible) { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<0>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<0>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } else { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<1>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<1>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } else { + auto new_key_buffer = allocate_dataframe_buffer( + process_local_edges[j] ? 
local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + edge_partition_key_buffers[j].resize(0, loop_stream); + edge_partition_key_buffers[j].shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } + } + + if constexpr (try_bitmap) { // if we are using a bitmap buffer + if (v_list_bitmap) { + std::vector> input_count_offset_vectors{}; + input_count_offset_vectors.reserve(loop_count); + + std::vector> filtered_bitmap_vectors{}; + std::vector> output_count_offset_vectors{}; + filtered_bitmap_vectors.reserve(loop_count); + output_count_offset_vectors.reserve(loop_count); + + std::vector range_offset_firsts(loop_count, 0); + std::vector range_offset_lasts(loop_count, 0); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector input_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto range_offset_first = + std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto input_count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + range_offset_first] __device__(size_t i) { + auto word = range_bitmap[i]; + if (i == packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // clear the bits in the sparse region + } + return static_cast(__popc(word)); + })); + input_count_offsets.resize( + (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); + input_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan( + rmm::exec_policy_nosync(loop_stream), + input_count_first, + input_count_first + + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), + input_count_offsets.begin() + 1); + } + range_offset_firsts[j] = range_offset_first; + range_offset_lasts[j] = range_offset_last; + } + input_count_offset_vectors.push_back(std::move(input_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector filtered_bitmap(0, loop_stream); + rmm::device_uvector output_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + filtered_bitmap.resize( + rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); + thrust::tabulate( + rmm::exec_policy_nosync(loop_stream), + filtered_bitmap.begin(), + filtered_bitmap.end(), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + range_offset_last, + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(size_t i) { + auto this_word_range_offset_first = cuda::std::max( + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()), + range_offset_first); + auto this_word_range_offset_last = + cuda::std::min(static_cast( + (packed_bool_offset(range_offset_first) + (i + 1)) * + packed_bools_per_word()), + range_offset_last); + auto range_lead_bits = static_cast(this_word_range_offset_first % + packed_bools_per_word()); + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask(range_offset_first % + packed_bools_per_word()); + } + auto this_word_hypersparse_offset_first = + (range_first + 
this_word_range_offset_first) - major_hypersparse_first; + auto num_bits = static_cast(this_word_range_offset_last - + this_word_range_offset_first); + auto hypersparse_lead_bits = + static_cast(this_word_hypersparse_offset_first) % + packed_bools_per_word(); + auto segment_bitmap_word = ((segment_bitmap[packed_bool_offset( + this_word_hypersparse_offset_first)] >> + hypersparse_lead_bits)) + << range_lead_bits; + auto remaining_bits = + (num_bits > (packed_bools_per_word() - hypersparse_lead_bits)) + ? (num_bits - (packed_bools_per_word() - hypersparse_lead_bits)) + : size_t{0}; + if (remaining_bits > 0) { + segment_bitmap_word |= + ((segment_bitmap + [packed_bool_offset(this_word_hypersparse_offset_first) + 1] & + packed_bool_partial_mask(remaining_bits)) + << ((packed_bools_per_word() - hypersparse_lead_bits) + + range_lead_bits)); + } + return range_bitmap_word & segment_bitmap_word; + })); + auto output_count_first = thrust::make_transform_iterator( + filtered_bitmap.begin(), + cuda::proclaim_return_type([] __device__(uint32_t word) { + return static_cast(__popc(word)); + })); + output_count_offsets.resize(filtered_bitmap.size() + 1, loop_stream); + output_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + output_count_first, + output_count_first + filtered_bitmap.size(), + output_count_offsets.begin() + 1); + } + } + filtered_bitmap_vectors.push_back(std::move(filtered_bitmap)); + output_count_offset_vectors.push_back(std::move(output_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto const& input_count_offsets = input_count_offset_vectors[j]; + auto const& filtered_bitmap = filtered_bitmap_vectors[j]; + auto const& output_count_offsets = output_count_offset_vectors[j]; + + if (keys.index() == 0) { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) 
+ i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + 
range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } else { // keys.index() != 0: read alternative 1 of keys (std::get<0> would throw here) + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the
sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], // keys.index() != 0 here + output_offset_first = std::get<1>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region +
range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + output_count_offsets.begin() + (output_count_offsets.size() - 1), + output_count_offsets.end(), + counters.data() + j, + typecast_t{}); + } else { + thrust::fill(rmm::exec_policy_nosync(loop_stream), + counters.data() + j, + counters.data() + (j + 1), + size_t{0}); + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + } + if (edge_partition_new_key_buffers) { // if there is no bitmap buffer + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto& new_keys = (*edge_partition_new_key_buffers)[j]; + if constexpr (try_bitmap) { + assert(!v_list_bitmap); + if (keys.index() == 0) { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(uint32_t v_offset) { + auto v = range_first + static_cast(v_offset); + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + 
input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(vertex_t v) { + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + 
thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto key) { + auto segment_offset = + thrust_tuple_get_or_identity(key) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + 
std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (edge_partition_new_key_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); + } + } + if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } + + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (process_local_edges[j]) { + auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + resize_dataframe_buffer( + std::get<0>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } else { + resize_dataframe_buffer( + std::get<1>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } + } else { + resize_dataframe_buffer(keys, key_segment_offsets[3] + h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + + auto& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(h_counts[j], loop_stream); + } else { + std::get<1>(offsets).resize(h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + } + } + + { // update edge_partition_deg1_hypersparse_key_offset_counts + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector h_ptrs( + loop_count); // pointers to 
hypersparse key offset vectors + std::vector h_scalars( + loop_count * 2); // (key offset vector sizes, start degree 1 key offset) + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (process_local_edges[j]) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + h_ptrs[j] = static_cast(std::get<0>(offsets).data()); + h_scalars[j * 2] = std::get<0>(offsets).size(); + } else { + h_ptrs[j] = static_cast(std::get<1>(offsets).data()); + h_scalars[j * 2] = std::get<1>(offsets).size(); + } + h_scalars[j * 2 + 1] = + local_key_list_sizes[partition_idx] - (*local_key_list_deg1_sizes)[partition_idx]; + } else { + h_ptrs[j] = static_cast(nullptr); + h_scalars[j * 2] = size_t{0}; + h_scalars[j * 2 + 1] = size_t{0}; + } + } + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + rmm::device_uvector d_scalars(h_scalars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_scalars.data(), h_scalars.data(), h_scalars.size(), handle.get_stream()); + rmm::device_uvector d_counts(loop_count, handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count), + d_counts.begin(), + cuda::proclaim_return_type( + [d_ptrs = raft::device_span(d_ptrs.data(), d_ptrs.size()), + d_scalars = raft::device_span(d_scalars.data(), d_scalars.size()), + uint32_key_output_offset] __device__(auto i) { + auto first = d_ptrs[i]; + if (first != static_cast(nullptr)) { + auto size = d_scalars[i * 2]; + auto start_offset = d_scalars[i * 2 + 1]; + if (uint32_key_output_offset) { + auto casted_first = static_cast(first); + return size - static_cast(thrust::distance( + casted_first, + thrust::lower_bound(thrust::seq, + casted_first, + casted_first + size, + static_cast(start_offset)))); + } else { + auto casted_first = 
static_cast(first); + return size - + static_cast(thrust::distance( + casted_first, + thrust::lower_bound( + thrust::seq, casted_first, casted_first + size, start_offset))); + } + } else { + return size_t{0}; + } + })); + raft::update_host((*edge_partition_deg1_hypersparse_key_offset_counts).data(), + d_counts.data(), + d_counts.size(), + handle.get_stream()); + handle.sync_stream(); + } + } + } + } +#if PER_V_PERFORMANCE_MEASUREMENT + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + auto subtime3 = std::chrono::steady_clock::now(); +#endif + + std::conditional_t>, + std::byte /* dummy */> + edge_partition_major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + edge_partition_major_output_buffers.reserve(loop_count); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + size_t buffer_size{0}; + if (process_local_edges[j]) { + if constexpr (use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + buffer_size = size_dataframe_buffer(std::get<0>(keys)); + } else { + buffer_size = size_dataframe_buffer(std::get<1>(keys)); + } + } else { + buffer_size = size_dataframe_buffer(keys); + } + } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + buffer_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } + } + edge_partition_major_output_buffers.push_back( + allocate_dataframe_buffer(buffer_size, loop_stream)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime4 = std::chrono::steady_clock::now(); +#endif + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, + // one of the non-init values will + // be selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; + } + } + + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = + size_dataframe_buffer(edge_partition_major_output_buffers[j]); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } + } + } + } else { + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = 
get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; + } + + bool processed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_first + std::get<0>(keys).size(), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices); + processed = true; + } + } + if (!processed) { + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + edge_partition_key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + edge_partition_key_last = get_dataframe_buffer_end(std::get<1>(keys)); + } else { + edge_partition_key_first = get_dataframe_buffer_begin(keys); + edge_partition_key_last = get_dataframe_buffer_end(keys); + } + } + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices); + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime5 = std::chrono::steady_clock::now(); +#endif + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + if constexpr (use_input_key) { + edge_partition_key_buffers.clear(); + edge_partition_key_buffers.shrink_to_fit(); + } + + if constexpr (std::is_same_v>) { + std::conditional_t< + filter_input_key, + std::optional, raft::device_span>>>, + std::byte /* dummy */> + edge_partition_hypersparse_non_deg1_key_offset_spans{}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + edge_partition_hypersparse_non_deg1_key_offset_spans = std::vector< + std::variant, raft::device_span>>( + loop_count); + } + } + + std::vector edge_partition_allreduce_sizes(loop_count); + std::vector edge_partition_allreduce_displacements(loop_count); + std::vector edge_partition_contiguous_sizes(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + size_t allreduce_size{}; + size_t contiguous_size{}; + if constexpr (filter_input_key) { + allreduce_size = local_key_list_sizes[partition_idx]; + if (local_key_list_deg1_sizes) { + allreduce_size -= (*local_key_list_deg1_sizes)[partition_idx]; + } + if (key_segment_offset_vectors) { + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + contiguous_size = key_segment_offsets[3]; + } else { + contiguous_size = local_key_list_sizes[partition_idx]; + } + } else { + static_assert(!use_input_key); + 
auto hypersparse_degree_offsets = + graph_view.local_edge_partition_hypersparse_degree_offsets(partition_idx); + allreduce_size = size_dataframe_buffer(output_buffer); + if (hypersparse_degree_offsets) { + allreduce_size -= *((*hypersparse_degree_offsets).rbegin()) - + *((*hypersparse_degree_offsets).rbegin() + 1); + } + contiguous_size = size_dtaframe_buffer(output_buffer); + } + edge_partition_allreduce_sizes[j] = allreduce_size; + edge_partition_contiguous_sizes[j] = contiguous_size; + } + std::exclusive_scan(edge_partition_allreduce_sizes.begin(), + edge_partition_allreduce_sizes.end(), + edge_partition_allreduce_displacements.begin(), + size_t{0}); + std::variant, rmm::device_uvector> + aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities) + .resize( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else { // priority == uint32_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + + if (offsets.index() == 0) { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<0>(offsets).data(), + std::get<0>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? 
(*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } else { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<1>(offsets).data(), + std::get<1>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; + } + } + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<0>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + } else { // priority == uint32_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<1>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime6 = std::chrono::steady_clock::now(); +#endif + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime7 = std::chrono::steady_clock::now(); +#endif + + std::vector< + std::variant, rmm::device_uvector>, + std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_non_deg1_key_offsets = + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + } + } + + auto contiguous_size = edge_partition_contiguous_sizes[j]; + + std::variant, rmm::device_uvector>, + std::optional>> + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + rmm::device_uvector(0, loop_stream)); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto priorities = raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } else { // priority_t == uint32_t + auto priorities = raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else { + std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime8 = std::chrono::steady_clock::now(); +#endif + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + + auto values = allocate_dataframe_buffer( + process_local_edges[j] ? 
size_dataframe_buffer(output_buffer) : size_t{0}, loop_stream); + if (process_local_edges[j]) { + if (minor_comm_rank == static_cast(partition_idx)) { + assert(!use_input_key); + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<0>(selected_ranks).begin(), + cuda::proclaim_return_type([minor_comm_rank] __device__(auto rank) { + return static_cast(rank) == minor_comm_rank; + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<1>(selected_ranks).begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = edge_partition_contiguous_sizes[j]; + if (edge_partition_hypersparse_non_deg1_key_offset_spans) { + auto const& span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (span.index() == 0) { + input_end_offset += std::get<0>(span).size(); + } else { + input_end_offset += std::get<1>(span).size(); + } + } + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + 
get_dataframe_buffer_begin(output_buffer) + input_end_offset, + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); + } + } + + edge_partition_values.push_back(std::move(values)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector copy_sizes(loop_count); + raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + std::optional< + std::vector, rmm::device_uvector>>> + edge_partition_deg1_hypersparse_output_offset_vectors{}; + + if (graph_view.use_dcs()) { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + std::variant, rmm::device_uvector> + output_offsets = rmm::device_uvector(0, loop_stream); + if (!uint32_key_output_offset) { + output_offsets = rmm::device_uvector(0, loop_stream); + } + + if (process_local_edges[j]) { + auto& values = edge_partition_values[j]; + + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; + } else { + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; + } + + if (output_offsets.index() == 0) { + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); + } else { + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + } + + size_t input_start_offset{}; + if constexpr (filter_input_key) { + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + input_start_offset = + edge_partition_contiguous_sizes[j] + + (span.index() == 0 ? 
std::get<0>(span).size() : std::get<1>(span).size()); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); + + if constexpr (filter_input_key) { + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (hypersparse_key_offsets.index() == 0) { + assert(output_offsets.index() == 0); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<0>(hypersparse_key_offsets).begin() + std::get<0>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<0>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<0>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } else { + assert(output_offsets.index() == 1); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<1>(hypersparse_key_offsets).begin() + std::get<1>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<1>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<1>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } + } 
else { + static_assert(!use_input_key); + assert(process_local_edges[j]); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(output_offsets)); + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector deg1_copy_sizes(loop_count); + raft::update_host( + deg1_copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + copy_sizes[j] += deg1_copy_sizes[j]; + auto& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } else { + assert(offsets.index() == 1); + std::get<1>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } + // skip shrink_to_fit() to cut 
execution time + } + } + } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + } +#if PER_V_PERFORMANCE_MEASUREMENT + if (loop_stream_pool_indices) { handle.sync_stream(); } + auto subtime9 = std::chrono::steady_clock::now(); +#endif + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(T), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + assert((cache_line_size % min_element_size) == 0); + size_t value_alignment = cache_line_size / min_element_size; + + size_t offset_alignment = 1; + if (graph_view.use_dcs()) { + static_assert(((cache_line_size % sizeof(uint32_t)) == 0) && + ((cache_line_size % sizeof(size_t)) == 0)); + offset_alignment = + cache_line_size / (uint32_key_output_offset ? sizeof(uint32_t) : sizeof(size_t)); + } + + std::optional> rx_value_sizes{}; + std::optional> rx_value_displs{}; + std::optional> rx_values{}; + + std::optional> rx_offset_sizes{}; + std::optional> rx_offset_displs{}; + std::optional, rmm::device_uvector>> + rx_offsets{}; + { + auto size_per_rank = + loop_count * (graph_view.use_dcs() ? 
2 /* value buffer size, offset buffer size */ + : 1 /* value buffer size */); + rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, + handle.get_stream()); + std::vector h_buffer_sizes(size_per_rank); + for (size_t j = 0; j < loop_count; ++j) { + h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + if (graph_view.use_dcs()) { + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + h_buffer_sizes[loop_count + j] = std::get<0>(offsets).size(); + } else { + assert(offsets.index() == 1); + h_buffer_sizes[loop_count + j] = std::get<1>(offsets).size(); + } + } + } + raft::update_device(d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + h_buffer_sizes.data(), + h_buffer_sizes.size(), + handle.get_stream()); + device_allgather(minor_comm, + d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + d_aggregate_buffer_sizes.data(), + size_per_rank, + handle.get_stream()); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + std::vector h_aggregate_buffer_sizes(d_aggregate_buffer_sizes.size()); + raft::update_host(h_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.size(), + handle.get_stream()); + handle.sync_stream(); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_value_sizes = std::vector(minor_comm_size); + rx_value_displs = std::vector(minor_comm_size); + if (graph_view.use_dcs()) { + rx_offset_sizes = std::vector(minor_comm_size); + rx_offset_displs = std::vector(minor_comm_size); + } + for (int k = 0; k < minor_comm_size; ++k) { + (*rx_value_sizes)[k] = h_aggregate_buffer_sizes[k * size_per_rank + j]; + if (graph_view.use_dcs()) { + (*rx_offset_sizes)[k] = + h_aggregate_buffer_sizes[k * size_per_rank + loop_count + j]; + } + } + + std::vector aligned_sizes(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + if (k == 
(minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_value_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_value_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_value_displs).begin(), size_t{0}); + + if (graph_view.use_dcs()) { + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_offset_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], offset_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_offset_displs).begin(), size_t{0}); + } + + rx_values = allocate_dataframe_buffer( + (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); + if (graph_view.use_dcs()) { + if (uint32_key_output_offset) { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } else { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } + } + } + } +#if PER_V_PERFORMANCE_MEASUREMENT + handle.sync_stream(); + auto subtime10 = std::chrono::steady_clock::now(); +#endif + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(*rx_values), + values.size(), + *rx_value_sizes, + *rx_value_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + dataframe_buffer_iterator_type_t{}, + values.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (graph_view.use_dcs()) { + device_group_start(minor_comm); + for (size_t j = 0; j < 
loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + std::get<0>(*rx_offsets).data(), + std::get<0>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + static_cast(nullptr), + std::get<0>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } else { + assert(offsets.index() == 1); + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + std::get<1>(*rx_offsets).data(), + std::get<1>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + static_cast(nullptr), + std::get<1>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + } + device_group_end(minor_comm); + } + handle.sync_stream(); // this is required before edge_partition_values.clear(); + edge_partition_values.clear(); + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } // to ensure that memory is freed +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime11 = std::chrono::steady_clock::now(); +#endif + + if (rx_values && (size_dataframe_buffer(*rx_values) > 0)) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto partition_idx = i + j; + + { // remove gaps introduced to enforce alignment + rmm::device_uvector bitmap( + packed_bool_size(size_dataframe_buffer(*rx_values)), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), 
bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + rmm::device_uvector d_displs((*rx_value_displs).size(), handle.get_stream()); + rmm::device_uvector d_sizes((*rx_value_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_value_displs).data(), + (*rx_value_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_value_sizes).data(), + (*rx_value_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + value_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = value_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + resize_dataframe_buffer( + *rx_values, + thrust::distance( + get_dataframe_buffer_begin(*rx_values), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); // now gaps are removed + + if 
(rx_offsets) { + size_t num_offsets = ((*rx_offsets).index() == 0) + ? size_dataframe_buffer(std::get<0>(*rx_offsets)) + : size_dataframe_buffer(std::get<1>(*rx_offsets)); + bitmap.resize(packed_bool_size(num_offsets), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + d_displs.resize((*rx_offset_displs).size(), handle.get_stream()); + d_sizes.resize((*rx_offset_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_offset_displs).data(), + (*rx_offset_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_offset_sizes).data(), + (*rx_offset_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + offset_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = offset_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if ((*rx_offsets).index() == 0) { + resize_dataframe_buffer( + std::get<0>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + get_dataframe_buffer_end(std::get<0>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] 
__device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } else { + resize_dataframe_buffer( + std::get<1>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + get_dataframe_buffer_end(std::get<1>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); // now gaps are removed + } + } + + size_t output_range_size{}; + if constexpr (filter_input_key) { + output_range_size = local_key_list_sizes[partition_idx]; + } else { + auto const& segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + output_range_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : graph_view.local_vertex_partition_range_size(); + } + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + auto old_size = std::get<0>(selected_ranks).size(); + std::get<0>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin() + old_size, + std::get<0>(selected_ranks).end(), + static_cast(minor_comm_size)); + } else { + assert(selected_ranks.index() == 1); + auto old_size = std::get<1>(selected_ranks).size(); + std::get<1>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin() + old_size, + std::get<1>(selected_ranks).end(), + minor_comm_size); + } + if (rx_offsets) { + rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); + raft::update_device(lasts.data(), + (*rx_offset_displs).data() + 1, + (*rx_offset_displs).size() - 1, + handle.get_stream()); + auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); + lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + handle.sync_stream(); // this is necessary before num_elements becomes out-of-scope + + if ((*rx_offsets).index() == 0) { + auto& offsets = std::get<0>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), 
lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } else { + assert((*rx_offsets).index() == 1); + auto& offsets = std::get<1>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto 
minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } + } + + size_t num_positions = (selected_ranks.index() == 0) ? std::get<0>(selected_ranks).size() + : std::get<1>(selected_ranks).size(); + if (num_positions <= static_cast(std::numeric_limits::max())) { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } else { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + 
std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } + } + handle.sync_stream(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime12 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::chrono::duration subdur6 = subtime7 - subtime6; + std::chrono::duration subdur7 = subtime8 - subtime7; + std::chrono::duration subdur8 = subtime9 - subtime8; + std::chrono::duration subdur9 = subtime10 - subtime9; + std::chrono::duration subdur10 = subtime11 - subtime10; + std::chrono::duration subdur11 = subtime12 - subtime11; + std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," + << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," + << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," + << subdur11.count() << ")" << std::endl; + raft::print_host_vector("bcast_sizes", bcast_sizes.data(), bcast_sizes.size(), std::cerr); + raft::print_host_vector("edge_partition_allreduce_sizes", + edge_partition_allreduce_sizes.data(), + edge_partition_allreduce_sizes.size(), + std::cerr); +#endif + } else { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + 
device_reduce(minor_comm, + get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]), + tmp_vertex_value_output_first, + size_dataframe_buffer(edge_partition_major_output_buffers[j]), + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + } + } +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time4 = std::chrono::steady_clock::now(); +#endif + + // 10. communication + + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // applying the initial value is deferred to here + vertex_t max_vertex_partition_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + max_vertex_partition_size = + std::max(max_vertex_partition_size, + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); + } + auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); + auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); + std::optional> minor_key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + minor_key_offsets = 
graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + for (int i = 0; i < major_comm_size; ++i) { + auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + thrust::fill(handle.get_thrust_policy(), + tx_buffer_first, + tx_buffer_first + + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), + minor_init); + auto value_first = thrust::make_transform_iterator( + view.value_first(), + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); + device_reduce(major_comm, + tx_buffer_first, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } else { + auto first_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); + vertex_t minor_range_first = + graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + auto offset = 
graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - + minor_range_first; + device_reduce(major_comm, + view.value_first() + offset, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } + } + +#if PER_V_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time5 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::chrono::duration dur3 = time4 - time3; + std::chrono::duration dur4 = time5 - time4; + std::cerr << "detail::per_v (pre, filter, post, ep, comm) took (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << ") num_concurrent_loops=" << num_concurrent_loops << std::endl; +#endif +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh index f426cd993ea..a166f37906a 100644 --- a/cpp/src/prims/detail/prim_functors.cuh +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -21,6 +21,23 @@ namespace cugraph { namespace detail { +/** Edge predicate functor that ignores all of its arguments and always returns true. */ +template +struct const_true_e_op_t { + __device__ auto operator()(std::conditional_t key_or_src, + std::conditional_t key_or_dst, + src_value_t, + dst_value_t, + e_value_t) const + { + return true; + } +}; + template +struct call_const_true_e_op_t { + __device__ auto operator()(edge_t i) const { return true; } +}; + template +#include + +namespace cugraph { + +namespace detail { + +/** + * Maps a communicator rank to a priority relative to root. + * + * root maps to priority 0. Another rank in the same subgroup as root (rank / subgroup_size == root / subgroup_size) maps into [1, subgroup_size). A rank in a different subgroup maps into [subgroup_size, comm_size). + * In both non-root cases the distance from root is shifted by offset modulo the size of the target range. + * priority_t must be 1, 2, or 4 bytes wide (static_assert); intermediate values are cast to cast_t to prevent overflow. + */ +template +__host__ __device__ priority_t +rank_to_priority(int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || 
sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (rank == root) { + return priority_t{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + auto rank_dist = + static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); + int modulo = subgroup_size - 1; + return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % + modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + auto subgroup_dist = + static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - + (root / subgroup_size)) % + (comm_size / subgroup_size)); + auto intra_subgroup_rank_dist = static_cast( + ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % + subgroup_size); + auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; + int modulo = comm_size - subgroup_size; + return static_cast( + subgroup_size + + (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); + } +} + +/** + * Maps a priority to a communicator rank relative to root. + * + * Priority 0 maps to root. A priority below subgroup_size maps to a rank inside the subgroup of root; any other priority maps to a rank taken modulo comm_size. + * The offset shift is subtracted here, so this looks like the inverse of rank_to_priority for the same root, subgroup_size, comm_size, and offset (NOTE(review): confirm). + */ +template +__host__ __device__ int priority_to_rank( + priority_t priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (priority == priority_t{0}) { + return root; + } else if (priority < static_cast(subgroup_size)) { + int modulo = subgroup_size - 1; + auto rank_dist = static_cast( + 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); + return static_cast((root - (root % subgroup_size)) + + ((static_cast(root) + rank_dist) % 
subgroup_size)); + } else { + int modulo = comm_size - subgroup_size; + auto rank_dist = static_cast( + subgroup_size + + ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); + auto subgroup_dist = rank_dist / subgroup_size; + auto intra_subgroup_rank_dist = rank_dist % subgroup_size; + return static_cast( + ((static_cast((root / subgroup_size) * subgroup_size) + + subgroup_dist * subgroup_size) + + (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % + comm_size); + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index d51e03628e1..5741c98d90e 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last())); auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(std::ignore, value_buffer) = detail:: - extract_transform_v_frontier_e( + std::tie(std::ignore, value_buffer) = + detail::extract_transform_v_frontier_e( handle, graph_view, frontier, diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 7ad033b93c2..ba227b263bc 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -64,13 +64,13 @@ namespace cugraph { * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. 
*/ template decltype(allocate_dataframe_buffer< - typename detail::edge_op_result_type(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + do_expensive_check); return value_buffer; } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 58dbf7e74a0..83e33f2b045 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include "prims/vertex_frontier.cuh" + #include #include #include @@ -129,8 +131,8 @@ template void fill_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMajorPropertyOutputWrapper edge_major_property_output, T input) { @@ -153,12 +155,12 @@ void fill_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); 
rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -169,14 +171,18 @@ void fill_edge_major_property(raft::handle_t const& handle, edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -199,7 +205,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), input, @@ -219,7 +225,7 @@ void fill_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -232,17 +238,18 @@ void fill_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_firsts[0]] __device__( auto v) { packed_bool_atomic_set(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - 
thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -280,17 +287,23 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } +#define FILL_PERFORMANCE_MEASUREMENT 0 + template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto t0 = std::chrono::steady_clock::now(); +#endif constexpr bool contains_packed_bool_element = cugraph::has_packed_bool_element(); @@ -300,22 +313,284 @@ void fill_edge_minor_property(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; auto edge_partition_value_first = edge_minor_property_output.value_first(); + vertex_t minor_range_first{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_range_first = graph_view.local_edge_partition_src_range_first(); + } else { + minor_range_first = graph_view.local_edge_partition_dst_range_first(); + } + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = 
minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + constexpr size_t packed_bool_word_bcast_alignment = + 128 / + sizeof( + uint32_t); // 128B cache line alignment (unaligned ncclBroadcast operations are slower) + + std::vector max_tmp_buffer_sizes{}; + std::vector local_v_list_sizes{}; + std::vector local_v_list_range_firsts{}; + std::vector local_v_list_range_lasts{}; + { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{4}, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{4}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{4}, + [max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05), + sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return static_cast(v_list_size); + } else if (i == 2) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_vertex_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + }); + + 
if (major_comm_size > 1) { // allgather max_tmp_buffer_size, v_list_size, v_list_range_first + // (inclusive), v_list_range_last (exclusive) + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{4}, + d_aggregate_tmps.data(), + size_t{4}, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(major_comm_size); + local_v_list_sizes = std::vector(major_comm_size); + local_v_list_range_firsts = std::vector(major_comm_size); + local_v_list_range_lasts = std::vector(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * size_t{4}]; + local_v_list_sizes[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 1]); + local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 2]); + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 3]); + } + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif + + auto edge_partition_keys = edge_minor_property_output.keys(); + + std::optional> v_list_bitmap{std::nullopt}; + std::optional> compressed_v_list{std::nullopt}; + if (major_comm_size > 1) { + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = 
static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += (range_size > 0) + ? (static_cast(num_keys) / static_cast(range_size)) + : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + double threshold_ratio = + 1.0 / static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / + static_cast(major_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { + if (is_packed_bool() && + !edge_partition_keys) { // directly update edge_minor_property_output (with special + // care for unaligned boundaries) + rmm::device_uvector boundary_words( + packed_bool_word_bcast_alignment, + handle.get_stream()); // for unaligned boundaries + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) == + packed_bool_offset(graph_view.local_vertex_partition_range_first() - + minor_range_first)) && + (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bools_per_word()) != + 0)) { // there are unaligned bits (fewer than packed_bools_per_word()) in the vertex + // partition boundary + leading_boundary_words = packed_bool_word_bcast_alignment; + } + thrust::fill(handle.get_thrust_policy(), + boundary_words.begin(), + boundary_words.begin() + leading_boundary_words, + packed_bool_empty_mask()); + if (local_v_list_range_firsts[major_comm_rank] < + local_v_list_range_lasts[major_comm_rank]) { + auto word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first); + 
auto word_offset_last = + packed_bool_offset((local_v_list_range_lasts[major_comm_rank] - 1) - + minor_range_first) + + 1; + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(word_offset_first), + thrust::make_counting_iterator(word_offset_last), + [sorted_unique_vertex_first, + sorted_unique_vertex_last, + input, + minor_range_first, + leading_boundary_words, + word_offset_first, + vertex_partition_range_last = graph_view.local_vertex_partition_range_last(), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto i) { + auto& word = ((i - word_offset_first) < leading_boundary_words) + ? boundary_words[i - word_offset_first] + : *(output_value_first + i); + auto word_v_first = + minor_range_first + static_cast(i * packed_bools_per_word()); + auto word_v_last = + ((vertex_partition_range_last - word_v_first) <= packed_bools_per_word()) + ? vertex_partition_range_last + : (word_v_first + static_cast(packed_bools_per_word())); + auto it = thrust::lower_bound( + thrust::seq, sorted_unique_vertex_first, sorted_unique_vertex_last, word_v_first); + while ((it != sorted_unique_vertex_last) && (*it < word_v_last)) { + auto v_offset = *it - minor_range_first; + if (input) { + word |= packed_bool_mask(v_offset); + } else { + word &= ~packed_bool_mask(v_offset); + } + ++it; + } + }); + } + rmm::device_uvector aggregate_boundary_words( + major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream()); + device_allgather(major_comm, + boundary_words.data(), + aggregate_boundary_words.data(), + packed_bool_word_bcast_alignment, + handle.get_stream()); + v_list_bitmap = std::move(aggregate_boundary_words); + } else { + v_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } 
else if (v_compressible) { + rmm::device_uvector tmps(local_v_list_sizes[major_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[major_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif + + std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_bcasts{1}; + { + size_t tmp_buffer_size_per_loop{}; + for (int i = 0; i < major_comm_size; ++i) { + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + tmp_buffer_size_per_loop += 0; + } else if (v_list_bitmap) { + tmp_buffer_size_per_loop += + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * + sizeof(uint32_t) + + static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } else { + tmp_buffer_size_per_loop += static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } + } + tmp_buffer_size_per_loop /= major_comm_size; + size_t max_streams = + static_cast(major_comm_size); // to allow setting num_concurrent_bcasts above + // handle.get_stream_pool_size() + stream_pool_indices = init_stream_pool_indices( + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(major_comm_size), + tmp_buffer_size_per_loop, + major_comm_size, + 1, + max_streams); + num_concurrent_bcasts = (*stream_pool_indices).size(); + if ((*stream_pool_indices).size() > handle.get_stream_pool_size()) { + (*stream_pool_indices).resize(handle.get_stream_pool_size()); + } + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + +#if FILL_PERFORMANCE_MEASUREMENT + std::cerr << "v_list_size=" << local_v_list_sizes[major_comm_rank] << " v_list_range=(" + << local_v_list_range_firsts[major_comm_rank] << "," 
+ << local_v_list_range_lasts[major_comm_rank] + << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() + << " compressed_v_list.has_value()=" << compressed_v_list.has_value() + << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; +#endif std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -324,88 +599,471 @@ void fill_edge_minor_property(raft::handle_t const& handle, key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(size_t{0})); - auto edge_partition_keys = edge_minor_property_output.keys(); - for (int i = 0; i < major_comm_size; ++i) { - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto t1 = std::chrono::steady_clock::now(); +#endif + for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); + auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); +#endif + std::vector leading_boundary_word_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + 
major_comm_size, minor_comm_size, partition_idx, minor_comm_rank); + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) == + packed_bool_offset(graph_view.vertex_partition_range_first(vertex_partition_id) - + minor_range_first)) && + (((local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + leading_boundary_word_counts[j] = leading_boundary_words; + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub1 = std::chrono::steady_clock::now(); +#endif + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + size_t bcast_size{0}; + vertex_t packed_bool_offset_first{0}; + if (local_v_list_range_firsts[partition_idx] < local_v_list_range_lasts[partition_idx]) { + auto leading_boundary_words = leading_boundary_word_counts[j]; + packed_bool_offset_first = + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) + + static_cast(leading_boundary_words); + auto packed_bool_offset_last = + packed_bool_offset(local_v_list_range_lasts[partition_idx] - 1 - minor_range_first); + if (packed_bool_offset_first <= packed_bool_offset_last) { + bcast_size = (packed_bool_offset_last - packed_bool_offset_first) + 1; + } + } + + device_bcast(major_comm, + edge_partition_value_first + packed_bool_offset_first, + edge_partition_value_first + packed_bool_offset_first, + bcast_size, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(major_comm); +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub2 = std::chrono::steady_clock::now(); +#endif + + rmm::device_uvector d_leading_boundary_word_counts( + leading_boundary_word_counts.size(), handle.get_stream()); + raft::update_device(d_leading_boundary_word_counts.data(), + 
leading_boundary_word_counts.data(), + leading_boundary_word_counts.size(), + handle.get_stream()); + + rmm::device_uvector d_local_v_list_range_firsts(loop_count, handle.get_stream()); + raft::update_device(d_local_v_list_range_firsts.data(), + local_v_list_range_firsts.data() + i, + loop_count, + handle.get_stream()); - if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + thrust::make_counting_iterator(loop_count * packed_bool_word_bcast_alignment), + [input, + minor_range_first, + leading_boundary_word_counts = raft::device_span( + d_leading_boundary_word_counts.data(), d_leading_boundary_word_counts.size()), + local_v_list_range_firsts = raft::device_span( + d_local_v_list_range_firsts.data(), d_local_v_list_range_firsts.size()), + aggregate_boundary_words = raft::device_span( + (*v_list_bitmap).data() + i * packed_bool_word_bcast_alignment, + loop_count * packed_bool_word_bcast_alignment), + output_value_first = edge_partition_value_first] __device__(size_t i) { + auto j = i / packed_bool_word_bcast_alignment; + auto 
leading_boundary_words = leading_boundary_word_counts[j]; + if ((i % packed_bool_word_bcast_alignment) < leading_boundary_words) { + auto boundary_word = aggregate_boundary_words[i]; + if (boundary_word != packed_bool_empty_mask()) { + auto word_offset = + packed_bool_offset(local_v_list_range_firsts[j] - minor_range_first) + + (i % packed_bool_word_bcast_alignment); + cuda::atomic_ref word( + *(output_value_first + word_offset)); + if (input) { + word.fetch_or(aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } } } }); +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub3 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = sub1 - sub0; + std::chrono::duration subdur1 = sub2 - sub1; + std::chrono::duration subdur2 = sub3 - sub2; + std::cerr << "fill_edge_minor path A took (" << subdur0.count() << "," << subdur1.count() + << "," << subdur2.count() << ")" << std::endl; +#endif } else { - if constexpr (contains_packed_bool_element) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = - edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type([edge_partition] __device__(auto v) { - return edge_partition.minor_offset_from_minor_nocheck(v); - })); - auto val_first = thrust::make_constant_iterator(input); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters 
from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + rx_counts[i], - map_first, - edge_partition_value_first); +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); +#endif + std::vector, rmm::device_uvector>> + edge_partition_v_buffers{}; + edge_partition_v_buffers.reserve(loop_count); + rmm::device_uvector dummy_counters(loop_count, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> v_buffer = + rmm::device_uvector(0, handle.get_stream()); + if (v_list_bitmap) { + v_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + } else if (compressed_v_list) { + v_buffer = + rmm::device_uvector(local_v_list_sizes[partition_idx], handle.get_stream()); + } else { + std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + } + edge_partition_v_buffers.push_back(std::move(v_buffer)); + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub1 = std::chrono::steady_clock::now(); +#endif + + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& v_buffer = edge_partition_v_buffers[j]; + if (v_list_bitmap) { + device_bcast(major_comm, + (*v_list_bitmap).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(major_comm, + (*compressed_v_list).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? 
sorted_unique_vertex_first + : static_cast(nullptr), + std::get<0>(v_buffer).data(), + std::get<0>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(major_comm); + bool kernel_fusion = + !edge_partition_keys && !v_list_bitmap && (loop_count > 1) && + (static_cast(std::reduce(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count))) < + size_t{256 * 1024} /* tuning parameter (binary search vs kernel launch overhead) */ * + loop_count); // FIXME: kernel fusion can be useful even when + // edge_partition_keys.has_value() is true + + if (!kernel_fusion) { + if (stream_pool_indices) { handle.sync_stream(); } + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub2 = std::chrono::steady_clock::now(); +#endif + + if (!kernel_fusion) { + size_t stream_pool_size{0}; + if (stream_pool_indices) { stream_pool_size = (*stream_pool_indices).size(); } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j % stream_pool_size]) + : handle.get_stream(); + + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + rx_vertices.begin(), + raft::device_span(dummy_counters.data() + j, size_t{1}), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + loop_stream); + edge_partition_v_buffers[j] = std::move(rx_vertices); + } + + if (edge_partition_keys) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [rx_vertex_first = compressed_v_list + ?
static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + subrange_key_first = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = + input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [minor_range_first, + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? 
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(v_offset + (range_first - minor_range_first)); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { // kernel fusion + std::vector h_vertex_vars(loop_count /* range_first values */ + + (loop_count + 1) /* loop offsets */); + std::copy(local_v_list_range_firsts.begin() + i, + local_v_list_range_firsts.begin() + (i + loop_count), + h_vertex_vars.begin()); + h_vertex_vars[loop_count] = 0; + std::inclusive_scan(local_v_list_sizes.begin() + i, + 
local_v_list_sizes.begin() + (i + loop_count), + h_vertex_vars.begin() + (loop_count + 1)); + std::vector h_ptrs(loop_count); + if (compressed_v_list) { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<1>(edge_partition_v_buffers[j]).data()); + } + } else { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<0>(edge_partition_v_buffers[j]).data()); + } + } + rmm::device_uvector d_vertex_vars(h_vertex_vars.size(), handle.get_stream()); + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_vertex_vars.data(), h_vertex_vars.data(), h_vertex_vars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + + raft::device_span range_firsts(d_vertex_vars.data(), loop_count); + raft::device_span loop_offsets(d_vertex_vars.data() + loop_count, + loop_count + 1); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(h_vertex_vars.back()), + [range_firsts, + loop_offsets, + minor_range_first, + input, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + output_value_first = edge_partition_value_first, + compressed = compressed_v_list.has_value()] __device__(auto i) { + auto loop_idx = + thrust::distance(loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto rx_first = rx_firsts[loop_idx]; + vertex_t minor{}; + if (compressed) { + minor = range_firsts[loop_idx] + + *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } else { + minor = *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + if (compressed_v_list) { + 
auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [range_firsts, + loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = + range_firsts[loop_idx] + *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } + } } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub3 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = sub1 - sub0; + std::chrono::duration subdur1 = sub2 - sub1; + std::chrono::duration subdur2 = sub3 - sub2; + std::cerr << "fill_edge_minor path B took (" << subdur0.count() << "," << subdur1.count() + << "," << subdur2.count() << ") kernel_fusion=" << kernel_fusion << std::endl; +#endif } } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto t2 = std::chrono::steady_clock::now(); + std::chrono::duration 
dur0 = t1 - t0; + std::chrono::duration dur1 = t2 - t1; + std::cerr << "fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" + << std::endl; +#endif } else { assert(graph_view.local_vertex_partition_range_size() == - graph_view.local_edge_partition_src_range_size()); + (GraphViewType::is_storage_transposed + ? graph_view.local_edge_partition_src_range_size() + : graph_view.local_edge_partition_dst_range_size())); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_first] __device__(auto v) { fill_scalar_or_thrust_tuple(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -451,8 +1109,8 @@ void fill_edge_src_property(raft::handle_t const& handle, /** * @brief Fill graph edge source property values to the input value. * - * This version fills only a subset of graph edge source property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -461,10 +1119,12 @@ void fill_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g.
CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & distinct (and + * should belong to the vertex partition assigned to this process in multi-GPU), otherwise undefined + * behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_src_property_output edge_src_property_view_t class object to store source property * values (for the edge source assigned to this process in multi-GPU). * @param input Edge source property values will be set to @p input.
@@ -476,8 +1136,8 @@ template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeSrcValueOutputWrapper edge_src_property_output, T input, bool do_expensive_check = false) @@ -486,8 +1146,8 @@ void fill_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -498,17 +1158,25 @@ void fill_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } else { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } } @@ -552,8 +1220,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, /** * 
@brief Fill graph edge destination property values to the input value. * - * This version fills only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -563,10 +1231,12 @@ void fill_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & + * distinct (and should belong to the vertex partition assigned to this process in multi-GPU), + * otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_dst_property_output edge_dst_property_view_t class object to store destination * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. 
@@ -578,8 +1248,8 @@ template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeDstValueOutputWrapper edge_dst_property_output, T input, bool do_expensive_check = false) @@ -588,8 +1258,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -600,17 +1270,25 @@ void fill_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } else { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } } diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh 
b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index ce5e5d3e8cf..f03e8f54fb2 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -250,11 +250,14 @@ void per_v_pair_transform_dst_nbr_intersection( } auto num_input_pairs = static_cast(thrust::distance(vertex_pair_first, vertex_pair_last)); - std::optional> unique_vertices{std::nullopt}; + std::optional> sorted_unique_vertices{std::nullopt}; std::optional(size_t{0}, rmm::cuda_stream_view{}))> - property_buffer_for_unique_vertices{std::nullopt}; + property_buffer_for_sorted_unique_vertices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - unique_vertices = rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); + auto& comm = handle.get_comms(); + + sorted_unique_vertices = + rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); auto elem0_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -262,7 +265,7 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem0_first, elem0_first + num_input_pairs, - (*unique_vertices).begin()); + (*sorted_unique_vertices).begin()); auto elem1_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -270,25 +273,25 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem1_first, elem1_first + num_input_pairs, - (*unique_vertices).begin() + num_input_pairs); - thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end()); - (*unique_vertices) - .resize(thrust::distance((*unique_vertices).begin(), + (*sorted_unique_vertices).begin() + num_input_pairs); + thrust::sort(handle.get_thrust_policy(), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end()); + (*sorted_unique_vertices) + 
.resize(thrust::distance((*sorted_unique_vertices).begin(), thrust::unique(handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end())), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end())), handle.get_stream()); - std::tie(unique_vertices, property_buffer_for_unique_vertices) = - collect_values_for_unique_int_vertices(handle, - std::move(*unique_vertices), - vertex_value_input_first, - graph_view.vertex_partition_range_lasts()); - thrust::sort_by_key( - handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end(), - (*property_buffer_for_unique_vertices).begin()); // necessary for binary search + property_buffer_for_sorted_unique_vertices = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span((*sorted_unique_vertices).data(), + (*sorted_unique_vertices).size()), + vertex_value_input_first, + graph_view.vertex_partition_range_lasts(), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } rmm::device_uvector vertex_pair_indices(num_input_pairs, handle.get_stream()); @@ -412,32 +415,32 @@ void per_v_pair_transform_dst_nbr_intersection( do_expensive_check); } - if (unique_vertices) { - auto vertex_value_input_for_unique_vertices_first = - get_dataframe_buffer_begin(*property_buffer_for_unique_vertices); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - decltype(vertex_value_input_for_unique_vertices_first), - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{edge_partition, - thrust::make_optional>( - (*unique_vertices).data(), (*unique_vertices).size()), - vertex_value_input_for_unique_vertices_first, - intersection_op, - intersection_offsets.data(), - 
intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + if (sorted_unique_vertices) { + auto vertex_value_input_for_sorted_unique_vertices_first = + get_dataframe_buffer_begin(*property_buffer_for_sorted_unique_vertices); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + decltype(vertex_value_input_for_sorted_unique_vertices_first), + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + VertexPairValueOutputIterator>{ + edge_partition, + thrust::make_optional>( + (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), + vertex_value_input_for_sorted_unique_vertices_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } else { thrust::for_each(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 03514e52e6e..ce810c1f854 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -206,7 +206,7 @@ struct return_value_compute_offset_t { template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, 
EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -238,7 +238,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, #ifndef NO_CUGRAPH_OPS using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< @@ -287,15 +287,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (do_expensive_check) { // FIXME: better re-factor this check function? - auto frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - auto frontier_vertex_last = - thrust_tuple_get_or_identity(frontier.end()); + auto key_list_vertex_first = + thrust_tuple_get_or_identity(key_list.begin()); + auto key_list_vertex_last = + thrust_tuple_get_or_identity(key_list.end()); auto num_invalid_keys = - frontier.size() - + key_list.size() - thrust::count_if(handle.get_thrust_policy(), - frontier_vertex_first, - frontier_vertex_last, + key_list_vertex_first, + key_list_vertex_last, check_in_range_t{graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last()}); if constexpr (GraphViewType::is_multi_gpu) { @@ -303,35 +303,35 @@ per_v_random_select_transform_e(raft::handle_t const& handle, handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_keys == size_t{0}, - "Invalid input argument: frontier includes out-of-range keys."); + "Invalid input argument: key_list includes out-of-range keys."); } - std::vector local_frontier_sizes{}; + std::vector local_key_list_sizes{}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - 
local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + local_key_list_sizes = host_scalar_allgather(minor_comm, key_list.size(), handle.get_stream()); } else { - local_frontier_sizes = std::vector{frontier.size()}; + local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_frontier_displacements(local_frontier_sizes.size()); - std::exclusive_scan(local_frontier_sizes.begin(), - local_frontier_sizes.end(), - local_frontier_displacements.begin(), + std::vector local_key_list_displacements(local_key_list_sizes.size()); + std::exclusive_scan(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + local_key_list_displacements.begin(), size_t{0}); - // 1. aggregate frontier + // 1. aggregate key_list - std::optional aggregate_local_frontier{std::nullopt}; + std::optional aggregate_local_key_list{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_frontier = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + aggregate_local_key_list = allocate_dataframe_buffer( + local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); device_allgatherv(minor_comm, - frontier.begin(), - get_dataframe_buffer_begin(*aggregate_local_frontier), - local_frontier_sizes, - local_frontier_displacements, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + local_key_list_displacements, handle.get_stream()); } @@ -340,66 +340,66 @@ per_v_random_select_transform_e(raft::handle_t const& handle, rmm::device_uvector sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + std::vector local_key_list_sample_offsets{}; if constexpr (std::is_same_v>) { - std::tie(sample_local_nbr_indices, sample_key_indices, 
local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin(), - local_frontier_displacements, - local_frontier_sizes, + (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin(), + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement); } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin(), + (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_frontier_displacements, - local_frontier_sizes, + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement, do_expensive_check); } - std::vector local_frontier_sample_counts(minor_comm_size); - std::adjacent_difference(local_frontier_sample_offsets.begin() + 1, - local_frontier_sample_offsets.end(), - local_frontier_sample_counts.begin()); + std::vector local_key_list_sample_counts(minor_comm_size); + std::adjacent_difference(local_key_list_sample_offsets.begin() + 1, + local_key_list_sample_offsets.end(), + local_key_list_sample_counts.begin()); // 3. 
transform auto sample_e_op_results = - allocate_dataframe_buffer(local_frontier_sample_offsets.back(), handle.get_stream()); + allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin()) + - local_frontier_displacements[i]; + auto edge_partition_key_list_first = + ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin()) + + local_key_list_displacements[i]; auto edge_partition_sample_local_nbr_index_first = - sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i]; + sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; auto edge_partition_sample_e_op_result_first = - get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i]; + get_dataframe_buffer_begin(sample_e_op_results) + local_key_list_sample_offsets[i]; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -416,14 +416,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (sample_key_indices) { auto edge_partition_sample_key_index_first = - (*sample_key_indices).begin() + local_frontier_sample_offsets[i]; + (*sample_key_indices).begin() + local_key_list_sample_offsets[i]; thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sample_counts[i]), + thrust::make_counting_iterator(local_key_list_sample_counts[i]), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{ edge_partition, thrust::make_optional(edge_partition_sample_key_index_first), - 
edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -445,10 +445,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(frontier.size() * K), + thrust::make_counting_iterator(key_list.size() * K), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{edge_partition, thrust::nullopt, - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -467,13 +467,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle, K}); } } - aggregate_local_frontier = std::nullopt; + aggregate_local_key_list = std::nullopt; // 4. shuffle randomly selected & transformed results and update sample_offsets auto sample_offsets = invalid_value ? 
std::nullopt : std::make_optional>( - frontier.size() + 1, handle.get_stream()); + key_list.size() + 1, handle.get_stream()); assert(K <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); @@ -484,12 +484,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, std::tie(sample_e_op_results, std::ignore) = shuffle_values(minor_comm, get_dataframe_buffer_begin(sample_e_op_results), - local_frontier_sample_counts, + local_key_list_sample_counts, handle.get_stream()); std::tie(sample_key_indices, std::ignore) = shuffle_values( - minor_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream()); + minor_comm, (*sample_key_indices).begin(), local_key_list_sample_counts, handle.get_stream()); - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::fill( handle.get_thrust_policy(), sample_counts.begin(), sample_counts.end(), int32_t{0}); auto sample_intra_partition_displacements = @@ -505,7 +505,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, frontier.size() * K, handle.get_stream()); + resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -554,7 +554,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_e_op_results = std::move(tmp_sample_e_op_results); } else { if (!invalid_value) { - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), sample_counts.begin(), @@ -603,8 
+603,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -615,8 +615,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -653,11 +653,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. 
If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -688,7 +688,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, { return detail::per_v_random_select_transform_e(handle, graph_view, - frontier, + key_list, edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, @@ -711,8 +711,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -721,8 +721,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. 
* @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -753,11 +753,11 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). 
*/ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -781,7 +781,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, return detail::per_v_random_select_transform_e( handle, graph_view, - frontier, + key_list, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), @@ -789,7 +789,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, detail::edge_endpoint_dummy_property_view_t, detail::edge_endpoint_dummy_property_view_t, edge_dummy_property_view_t, - typename VertexFrontierBucketType::key_type>{}, + typename KeyBucketType::key_type>{}, edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 5a5e9332094..c13816242bc 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -924,11 +924,12 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(unique_minor_keys, values_for_unique_keys) = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, kv_store_view, std::move(unique_minor_keys), cugraph::detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); if constexpr (KVStoreViewType::binary_search) { multi_gpu_minor_key_value_map_ptr = 
diff --git a/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh new file mode 100644 index 00000000000..1e0d366429e --- /dev/null +++ b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +/** + * @brief Iterate over every vertex's incoming edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce. In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. 
+ * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to + * fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
+ * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. 
+ * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). 
Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. 
`vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. 
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. 
It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). 
In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). 
Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to + * one. There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. 
`vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 027ef1f662d..5ba7edec894 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -15,558 +15,165 @@ */ #pragma once -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/prim_functors.cuh" -#include "prims/fill_edge_src_dst_property.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" -#include -#include -#include #include #include -#include -#include -#include #include -#include -#include #include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include namespace cugraph { -namespace detail { - -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; - -template -struct transform_and_atomic_reduce_t { - edge_partition_device_view_t const& edge_partition{}; - result_t identity_element{}; - vertex_t const* indices{nullptr}; - TransformOp const& transform_op{}; - ResultValueOutputIteratorOrWrapper& result_value_output{}; - - __device__ void operator()(edge_t i) const - { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - if constexpr (multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } -}; - -template -__device__ void update_result_value_output( - edge_partition_device_view_t const& edge_partition, - vertex_t const* indices, - edge_t local_degree, - TransformOp const& transform_op, - result_t init, - ReduceOp const& reduce_op, - size_t output_idx /* relevent only when update_major === true */, - result_t identity_element, - ResultValueOutputIteratorOrWrapper& result_value_output) -{ - if constexpr (update_major) { - *(result_value_output + output_idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, identity_element, indices, transform_op, result_value_output}); - } -} - -template -__global__ static void per_v_transform_reduce_e_hypersparse( - edge_partition_device_view_t edge_partition, - EdgePartitionSrcValueInputWrapper 
edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - - while (idx < static_cast(dcs_nzd_vertex_count)) { - auto major = - *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto major_idx = - major_start_offset + idx; // major_offset != major_idx in the hypersparse region - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_idx)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - 
- update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_low_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - 
edge_partition.local_edges(static_cast(major_offset)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_mid_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) + typename T, + typename VertexValueOutputIterator> +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator 
vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); - auto const lane_id = tid % raft::warp_size(); - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid / raft::warp_size()); - - using WarpReduce = cub::WarpReduce; - [[maybe_unused]] __shared__ typename WarpReduce::TempStorage - temp_storage[per_v_transform_reduce_e_kernel_block_size / - raft::warp_size()]; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - lane_id == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x * (blockDim.x / raft::warp_size()); + if (do_expensive_check) { + // currently, nothing to do } -} - -template -__global__ static void per_v_transform_reduce_e_high_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper 
edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(blockIdx.x); - - using BlockReduce = cub::BlockReduce; - [[maybe_unused]] __shared__ - typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - threadIdx.x == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x; - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -template -void per_v_transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - 
EdgeOp e_op, - T init, - ReduceOp reduce_op, - VertexValueOutputIterator vertex_value_output_first) +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(ReduceOp::pure_function && reduce_op::has_compatible_raft_comms_op_v && - reduce_op::has_identity_element_v); // current restriction, to support - // general reduction, we may need to - // take a less efficient code path - - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + static_assert(GraphViewType::is_storage_transposed); - using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeSrcValueInputWrapper::value_iterator, - typename EdgeSrcValueInputWrapper::value_type>>; - using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeDstValueInputWrapper::value_iterator, - typename EdgeDstValueInputWrapper::value_type>>; - using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_edge_dummy_property_device_view_t, - detail::edge_partition_edge_property_device_view_t< - edge_t, - typename 
EdgeValueInputWrapper::value_iterator, - typename EdgeValueInputWrapper::value_type>>; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { // no vertices in the zero degree segment are visited - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not - // store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? 
init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - std::optional> stream_pool_indices{std::nullopt}; - if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? V / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); - if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } - - auto avg_vertex_degree = - graph_view.number_of_vertices() > 0 - ? 
(static_cast(graph_view.compute_number_of_edges(handle)) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; - - num_streams = - std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))) * - max_segments, - num_streams); - } - - if (num_streams >= max_segments) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - } - - std::vector(0, rmm::cuda_stream_view{}))> - major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - 
major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); - } - - if (stream_pool_indices) { handle.sync_stream(); } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto major_init = ReduceOp::identity_element; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; - } else { - major_init = init; - } - } - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = 
major_buffer_first; - } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - - if (*(edge_partition.dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition.major_range_first() + (*segment_offsets)[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } else { - if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(edge_partition.major_range_size(), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_last(), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& comm = handle.get_comms(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (segment_offsets && stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - device_reduce( - minor_comm, - major_buffer_first + (*segment_offsets)[3], - vertex_value_output_first + (*segment_offsets)[3], - (*segment_offsets)[4] - (*segment_offsets)[3], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())); - } - if 
((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[2], - vertex_value_output_first + (*segment_offsets)[2], - (*segment_offsets)[3] - (*segment_offsets)[2], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[1], - vertex_value_output_first + (*segment_offsets)[1], - (*segment_offsets)[2] - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size())); - } - } else { - size_t reduction_size = static_cast( - segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } - } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as *segment_offsets - // do not necessarily coincide in different edge partitions). 
- } + if (do_expensive_check) { + // currently, nothing to do } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // applying the initial value is deferred to here - vertex_t max_vertex_partition_size{0}; - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - max_vertex_partition_size = - std::max(max_vertex_partition_size, - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); - } - auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); - auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); - std::optional> minor_key_offsets{}; - if constexpr (GraphViewType::is_storage_transposed) { - minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); - } else { - minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); - } - for (int i = 0; i < major_comm_size; ++i) { - auto minor_init = (major_comm_rank == i) ? 
init : ReduceOp::identity_element; - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - thrust::fill(handle.get_thrust_policy(), - tx_buffer_first, - tx_buffer_first + - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), - minor_init); - auto value_first = thrust::make_transform_iterator( - view.value_first(), - cuda::proclaim_return_type( - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); - thrust::scatter(handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - cuda::proclaim_return_type( - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { - return key - key_first; - })), - tx_buffer_first); - device_reduce(major_comm, - tx_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } else { - auto first_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); - vertex_t minor_range_first = - graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - - minor_range_first; - device_reduce(major_comm, - view.value_first() + offset, - vertex_value_output_first, - static_cast( - 
graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -} // namespace detail - /** - * @brief Iterate over every vertex's incoming edges to update vertex properties. + * @brief Iterate over every vertex's outgoing edges to update vertex properties. * - * This function is inspired by thrust::transform_reduce. + * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. @@ -1131,8 +240,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to - * fill the wrapper. + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. * @param edge_dst_value_input Wrapper used to access destination input property values (for the * edge destinations assigned to this process in multi-GPU). 
Use either * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or @@ -1145,14 +254,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first - * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` * (exclusive) is deduced as @p vertex_value_output_first + @p * graph_view.local_vertex_partition_range_size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
@@ -1165,7 +276,7 @@ template -void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, +void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, @@ -1180,23 +291,37 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** - * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. * * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -1207,6 +332,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. 
+ * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -1223,20 +350,22 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * access edge property values). * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. - * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the - * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` - * (exclusive) is deduced as @p vertex_value_output_first + @p - * graph_view.local_vertex_partition_range_size(). 
+ * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ template void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -1255,19 +385,33 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + if (do_expensive_check) { // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } } // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh new file mode 100644 index 00000000000..eaa328a0309 --- /dev/null +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -0,0 +1,1277 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/extract_transform_v_frontier_e.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; + +template +struct transform_reduce_v_frontier_call_e_op_t { + EdgeOp e_op{}; + + __device__ thrust::optional< + std::conditional_t && !std::is_same_v, + thrust::tuple, + std::conditional_t, key_t, payload_t>>> + operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const + { + auto e_op_result = e_op(key, dst, sv, dv, ev); + if (e_op_result.has_value()) { + auto reduce_by = dst; + if constexpr (std::is_same_v && std::is_same_v) { + return reduce_by; + } else if constexpr (std::is_same_v && !std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else if constexpr (!std::is_same_v && std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else { + return 
thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), + thrust::get<1>(*e_op_result)); + } + } else { + return thrust::nullopt; + } + } +}; + +template +struct update_keep_flag_t { + using input_key_t = + typename thrust::iterator_traits::value_type; // uint32_t (compressed) or + // key_t (i.e. vertex_t) + + raft::device_span bitmap{}; + raft::device_span keep_flags{}; + key_t v_range_first{}; + InputKeyIterator input_key_first{}; + thrust::optional invalid_input_key{}; + + __device__ void operator()(size_t i) const + { + auto v = *(input_key_first + i); + if (invalid_input_key && (v == *invalid_input_key)) { + return; // just discard + } + input_key_t v_offset{}; + if constexpr ((sizeof(key_t) == 8) && std::is_same_v) { + v_offset = v; + } else { + v_offset = v - v_range_first; + } + cuda::atomic_ref bitmap_word( + bitmap[packed_bool_offset(v_offset)]); + auto old = bitmap_word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + if ((old & packed_bool_mask(v_offset)) == packed_bool_empty_mask()) { + cuda::atomic_ref keep_flag_word( + keep_flags[packed_bool_offset(i)]); + keep_flag_word.fetch_or(packed_bool_mask(i), cuda::std::memory_order_relaxed); + } + } +}; + +template +std::tuple, optional_dataframe_buffer_type_t> +filter_buffer_elements( + raft::handle_t const& handle, + rmm::device_uvector&& + unique_v_buffer, // assumes that buffer elements are locally reduced first and unique + optional_dataframe_buffer_type_t&& payload_buffer, + raft::device_span vertex_range_offsets, + vertex_t allreduce_count_per_rank, + int subgroup_size) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + rmm::device_uvector priorities(allreduce_count_per_rank * major_comm_size, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + priorities.begin(), + priorities.end(), + 
std::numeric_limits::max()); + thrust::for_each( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = + thrust::distance(offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + priorities[allreduce_count_per_rank * root + v_offset] = + rank_to_priority( + major_comm_rank, root, subgroup_size, major_comm_size, v_offset); + } + }); + device_allreduce(major_comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + handle.get_stream()); + if constexpr (std::is_same_v) { + unique_v_buffer.resize( + thrust::distance( + unique_v_buffer.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + unique_v_buffer.begin(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + } else { + auto kv_pair_first = thrust::make_zip_iterator(unique_v_buffer.begin(), + get_dataframe_buffer_begin(payload_buffer)); + unique_v_buffer.resize( + thrust::distance( + kv_pair_first, + thrust::remove_if( + 
handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + unique_v_buffer.size(), + unique_v_buffer.begin(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + resize_dataframe_buffer(payload_buffer, unique_v_buffer.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(unique_v_buffer), std::move(payload_buffer)); +} + +template +std::tuple, optional_dataframe_buffer_type_t> +sort_and_reduce_buffer_elements( + raft::handle_t const& handle, + dataframe_buffer_type_t&& key_buffer, + optional_dataframe_buffer_type_t&& payload_buffer, + ReduceOp reduce_op, + std::conditional_t, std::vector, std::byte /* dummy */> + vertex_range_offsets, + std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) +{ + constexpr bool compressed = + std::is_integral_v && (sizeof(key_t) == 8) && + std::is_same_v; // we currently compress only when key_t is an integral + // type (i.e. 
vertex_t) + static_assert(compressed || std::is_same_v); + + if constexpr (std::is_integral_v && + (std::is_same_v || + std::is_same_v>)) { // try to use + // bitmap for + // filtering + key_t range_size = vertex_range_offsets.back() - vertex_range_offsets.front(); + if (static_cast(size_dataframe_buffer(key_buffer)) >= + static_cast(range_size) * + 0.125 /* tuning parameter */) { // use bitmap for filtering + rmm::device_uvector bitmap(packed_bool_size(range_size), handle.get_stream()); + rmm::device_uvector keep_flags(packed_bool_size(size_dataframe_buffer(key_buffer)), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::fill( + handle.get_thrust_policy(), keep_flags.begin(), keep_flags.end(), packed_bool_empty_mask()); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + update_keep_flag_t{ + raft::device_span(bitmap.data(), bitmap.size()), + raft::device_span(keep_flags.data(), keep_flags.size()), + vertex_range_offsets.front(), + get_dataframe_buffer_begin(key_buffer), + to_thrust_optional(invalid_key)}); + auto stencil_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span(keep_flags.data(), + keep_flags.size())] __device__(size_t i) { + return (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) != + packed_bool_empty_mask(); + })); + if constexpr (std::is_same_v) { + resize_dataframe_buffer( + key_buffer, + thrust::distance(get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), + 
get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + static_assert(std::is_same_v>); + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + } + + if constexpr (compressed) { + rmm::device_uvector output_key_buffer(key_buffer.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + key_buffer.begin(), + key_buffer.end(), + output_key_buffer.begin(), + cuda::proclaim_return_type( + [v_first = vertex_range_offsets.front()] __device__(uint32_t v_offset) { + return static_cast(v_first + v_offset); + })); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); + } else { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } + } + } + + if constexpr (std::is_same_v) { + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer)); + } + + auto output_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + if constexpr (std::is_same_v) { + if constexpr 
(compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = vertex_range_offsets.front()] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + get_dataframe_buffer_begin(output_key_buffer), + thrust::copy_if(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + get_dataframe_buffer_begin(output_key_buffer), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + } else { + resize_dataframe_buffer( + key_buffer, + thrust::distance( + get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + } else if constexpr (std::is_same_v>) { + if constexpr (compressed) { + resize_dataframe_buffer( + 
output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = vertex_range_offsets.front()] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + auto tmp_payload_buffer = allocate_dataframe_buffer( + size_dataframe_buffer(payload_buffer), handle.get_stream()); + auto input_pair_first = + thrust::make_zip_iterator(input_key_first, get_dataframe_buffer_begin(payload_buffer)); + auto output_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_key_buffer), + get_dataframe_buffer_begin(tmp_payload_buffer)); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + output_pair_first, + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + tmp_payload_buffer, size_dataframe_buffer(output_key_buffer), handle.get_stream()); + payload_buffer = std::move(tmp_payload_buffer); + } else { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance( + pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = 
get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + } else { + if (invalid_key) { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + cuda::proclaim_return_type( + [invalid_key = *invalid_key] __device__(auto kv) { + auto key = thrust::get<0>(kv); + return key == invalid_key; + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + } + auto num_uniques = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + is_first_in_run_t{ + get_dataframe_buffer_begin(key_buffer)}); + + auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); + auto new_payload_buffer = + allocate_dataframe_buffer(num_uniques, handle.get_stream()); + + if constexpr (compressed) { + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = vertex_range_offsets.front()] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + 
thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } else { + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } + + output_key_buffer = std::move(new_key_buffer); + payload_buffer = std::move(new_payload_buffer); + } + + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); +} + +#if 1 // FIXME: delete +#define TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT 0 +#endif + +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + using payload_t = typename ReduceOp::value_type; + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. 
fill the buffer + +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time0 = std::chrono::steady_clock::now(); +#endif + detail::transform_reduce_v_frontier_call_e_op_t + e_op_wrapper{e_op}; + + auto [key_buffer, payload_buffer] = + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); + auto size_before_lreduce = size_dataframe_buffer(key_buffer); +#endif + + // 2. reduce the buffer + + std::vector vertex_range_offsets{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + vertex_range_offsets = std::vector(major_comm_size + 1); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + vertex_range_offsets[i] = graph_view.vertex_partition_range_first(vertex_partition_id); + } + vertex_range_offsets.back() = graph_view.local_edge_partition_dst_range_last(); + } else { + vertex_range_offsets = std::vector{graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()}; + } + std::conditional_t, std::vector, std::byte /* dummy */> + aux_range_offsets{}; + if constexpr (std::is_integral_v) { aux_range_offsets = vertex_range_offsets; } + std::tie(key_buffer, payload_buffer) = + 
detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + std::nullopt); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); + auto time3 = std::chrono::steady_clock::now(); + auto time4 = std::chrono::steady_clock::now(); + auto time5 = std::chrono::steady_clock::now(); + auto size_after_lreduce = size_dataframe_buffer(key_buffer); + auto size_after_filter = size_after_lreduce; + auto size_before_greduce = size_after_lreduce; +#endif + bool aligned_path = false; // FIXME: delete + double fill_ratio = 0.0; // FIXME: delete + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + size_t local_key_buffer_size = size_dataframe_buffer(key_buffer); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + + rmm::device_uvector d_vertex_range_offsets(vertex_range_offsets.size(), + handle.get_stream()); + raft::update_device(d_vertex_range_offsets.data(), + vertex_range_offsets.data(), + vertex_range_offsets.size(), + handle.get_stream()); + + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + std::conditional_t + max_vertex_partition_size{}; + if constexpr (try_compression) { + for (int i = 0; i < major_comm_size; ++i) { + max_vertex_partition_size = std::max( + vertex_range_offsets[i + 1] - vertex_range_offsets[i], max_vertex_partition_size); + } + } + + if constexpr (std::is_same_v && + std::is_same_v>) { + vertex_t min_vertex_partition_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + min_vertex_partition_size = std::min( + vertex_range_offsets[i + 1] - vertex_range_offsets[i], 
min_vertex_partition_size); + } + + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + if (segment_offsets && + (static_cast(avg_key_buffer_size) > + static_cast(graph_view.number_of_vertices() / comm_size) * + double{0.2})) { // duplicates expected for high in-degree vertices (and we assume + // correlation between in-degrees & out-degrees) // FIXME: we need + // a better criterion + size_t key_size{0}; + size_t payload_size{0}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } + } else { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + payload_size = sizeof(payload_t); + } else { + payload_size = sum_thrust_tuple_element_sizes(); + } + } + + int subgroup_size{}; + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = major_comm_size; + } else { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? 
std::min(major_comm_size, num_gpus_per_node) + : std::max(num_gpus_per_node / minor_comm_size, int{1}); + } + + auto p2p_size_per_rank = avg_key_buffer_size * (key_size + payload_size); + auto p2p_size_per_node = p2p_size_per_rank * std::min(num_gpus_per_node, comm_size); + auto allreduce_size_per_node = p2p_size_per_node / 16 /* tuning parameter */; + auto allreduce_size_per_rank = + allreduce_size_per_node / (major_comm_size * (num_gpus_per_node / subgroup_size)); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + std::cerr << "p2p_size_per_rank=" << p2p_size_per_rank + << " p2p_size_per_node=" << p2p_size_per_node + << " allreduce_size_per_node=" << allreduce_size_per_node + << " allreduce_size_per_rank=" << allreduce_size_per_rank << std::endl; +#endif + + if (major_comm_size <= std::numeric_limits::max()) { // priority = uint8_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_range_offsets.data(), + d_vertex_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint8_t)), + min_vertex_partition_size), + subgroup_size); + } else { // priority = uint32_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_range_offsets.data(), + d_vertex_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint32_t)), + min_vertex_partition_size), + subgroup_size); + } + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_after_filter = size_dataframe_buffer(key_buffer); +#endif + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time3 = std::chrono::steady_clock::now(); +#endif + + rmm::device_uvector d_tx_buffer_last_boundaries(major_comm_size, handle.get_stream()); + auto key_v_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + 
thrust::lower_bound(handle.get_thrust_policy(), + key_v_first, + key_v_first + size_dataframe_buffer(key_buffer), + d_vertex_range_offsets.begin() + 1, + d_vertex_range_offsets.end(), + d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span(d_vertex_range_offsets.data(), + static_cast(major_comm_size)), + lasts = raft::device_span( + d_vertex_range_offsets.data() + 1, + static_cast(major_comm_size))] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } + } + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + min_element_size = 
std::min(sizeof(key_t), min_element_size); + } + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(min_thrust_tuple_element_sizes(), min_element_size); + } + } + assert((cache_line_size % min_element_size) == 0); + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; + +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time4 = std::chrono::steady_clock::now(); +#endif + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + aligned_path = true; // FIXME: delete + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; + } + + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + 
std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + invalid_key, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + alignment, + std::nullopt, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } else { + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = 
std::move(rx_key_buffer); + } + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time5 = std::chrono::steady_clock::now(); +#endif + + if constexpr (std::is_integral_v) { + aux_range_offsets = std::vector{graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()}; +#if 1 // FIXME: delete + size_t key_buffer_size{}; + if constexpr (try_compression) { + if (compressed_v_buffer) { + key_buffer_size = (*compressed_v_buffer).size(); + } else { + key_buffer_size = size_dataframe_buffer(key_buffer); + } + } else { + key_buffer_size = size_dataframe_buffer(key_buffer); + } + fill_ratio = static_cast(key_buffer_size) / + static_cast(aux_range_offsets.back() - aux_range_offsets.front()); +#endif + } + if constexpr (try_compression) { + if (compressed_v_buffer) { +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_before_greduce = size_dataframe_buffer(*compressed_v_buffer); // FIXME: delete +#endif + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete +#endif + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } + } else { +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete +#endif + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + invalid_key); + } + } + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time6 = std::chrono::steady_clock::now(); + auto size_after_greduce = size_dataframe_buffer(key_buffer); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::chrono::duration dur3 = time4 - time3; + std::chrono::duration dur4 = time5 - time4; + std::chrono::duration dur5 = time6 - time5; + std::cerr << "\tprim (fill,lreduce,filter,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << "," << dur5.count() << ") l_size=(" << size_before_lreduce << "," + << size_after_lreduce << ") f_size=" << size_after_filter << " g_size=(" + << size_before_greduce << "," << size_after_greduce << ")" + << " aligned_path=" << aligned_path << " fill_ratio=" << fill_ratio << std::endl; +#endif + + if constexpr (!std::is_same_v) { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } else { + return std::move(key_buffer); + } +} + +} // namespace detail + +template +size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + + size_t 
ret{0}; + + auto local_frontier_vertex_first = + thrust_tuple_get_or_identity(frontier.begin()); + + std::vector local_frontier_sizes{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast(frontier.size())}; + } + + auto edge_mask_view = graph_view.edge_mask_view(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], + handle.get_stream()); + device_bcast(minor_comm, + local_frontier_vertex_first, + edge_partition_frontier_vertices.data(), + local_frontier_sizes[i], + static_cast(i), + handle.get_stream()); + + if (edge_partition_e_mask) { + ret += + edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } + } else { + assert(i == 0); + if (edge_partition_e_mask) { + ret += edge_partition.compute_number_of_edges_with_mask( + (*edge_partition_e_mask).value_first(), + local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + 
handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } + } + } + + return ret; +} + +/** + * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor + * outputs by (tagged-)destination ID. + * + * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the + * current (tagged-)vertex frontier. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param frontier KeyBucketType class object for the current vertex frontier. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. 
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for + * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be + * discarded); 2) dummy (but valid) thrust::optional object (e.g. + * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is + * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be + * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); + * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type + * is not void). + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to + * one. There are pre-defined reduction operators in prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. 
+ * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key + * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order + * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. + */ +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + return detail::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh deleted file mode 100644 index e58ab08fa97..00000000000 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/extract_transform_v_frontier_e.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace cugraph { - -namespace detail { - -int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; - -template -struct transform_reduce_v_frontier_call_e_op_t { - EdgeOp e_op{}; - - __device__ thrust::optional< - std::conditional_t && !std::is_same_v, - thrust::tuple, - std::conditional_t, key_t, payload_t>>> - operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const - { - auto e_op_result = e_op(key, dst, sv, dv, ev); - if (e_op_result.has_value()) { - auto reduce_by = reduce_by_src ? 
thrust_tuple_get_or_identity(key) : dst; - if constexpr (std::is_same_v && std::is_same_v) { - return reduce_by; - } else if constexpr (std::is_same_v && !std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else if constexpr (!std::is_same_v && std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else { - return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), - thrust::get<1>(*e_op_result)); - } - } else { - return thrust::nullopt; - } - } -}; - -template -auto sort_and_reduce_buffer_elements( - raft::handle_t const& handle, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& key_buffer, - decltype(allocate_optional_dataframe_buffer(0, - rmm::cuda_stream_view{}))&& payload_buffer, - ReduceOp reduce_op) -{ - if constexpr (std::is_same_v) { - thrust::sort(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - } else { - thrust::sort_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - } - - if constexpr (std::is_same_v) { - auto it = thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - resize_dataframe_buffer( - key_buffer, - static_cast(thrust::distance(get_dataframe_buffer_begin(key_buffer), it)), - handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - } else if constexpr (std::is_same_v>) { - auto it = thrust::unique_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - resize_dataframe_buffer(key_buffer, - static_cast(thrust::distance( - get_dataframe_buffer_begin(key_buffer), thrust::get<0>(it))), - handle.get_stream()); - resize_dataframe_buffer(payload_buffer, 
size_dataframe_buffer(key_buffer), handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); - } else { - auto num_uniques = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), - is_first_in_run_t{ - get_dataframe_buffer_begin(key_buffer)}); - - auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); - auto new_payload_buffer = - allocate_dataframe_buffer(num_uniques, handle.get_stream()); - - thrust::reduce_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer), - get_dataframe_buffer_begin(new_key_buffer), - get_dataframe_buffer_begin(new_payload_buffer), - thrust::equal_to(), - reduce_op); - - key_buffer = std::move(new_key_buffer); - payload_buffer = std::move(new_payload_buffer); - } - - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); -} - -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = 
typename VertexFrontierBucketType::key_type; - using payload_t = typename ReduceOp::value_type; - - if (do_expensive_check) { - // currently, nothing to do - } - - // 1. fill the buffer - - detail::transform_reduce_v_frontier_call_e_op_t - e_op_wrapper{e_op}; - - bool constexpr max_one_e_per_frontier_key = - reduce_by_src && std::is_same_v>; - auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e( - handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); - - // 2. reduce the buffer - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - std::vector h_vertex_lasts(reduce_by_src ? minor_comm_size : major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto vertex_partition_id = - reduce_by_src - ? 
detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, - minor_comm_size, - major_comm_rank, - minor_comm_rank}( - i) - : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto reduce_by_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - reduce_by_first, - reduce_by_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = - shuffle_values(reduce_by_src ? 
minor_comm : major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - } - - if constexpr (!std::is_same_v) { - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); - } else { - return std::move(key_buffer); - } -} - -} // namespace detail - -template -size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - - size_t ret{0}; - - auto local_frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - - std::vector local_frontier_sizes{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); - } else { - local_frontier_sizes = std::vector{static_cast(frontier.size())}; - } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], - handle.get_stream()); - device_bcast(minor_comm, - local_frontier_vertex_first, - edge_partition_frontier_vertices.data(), - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); - - if (edge_partition_e_mask) { - ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } - } else { - assert(i == 0); - if (edge_partition_e_mask) { - ret += edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_e_mask).value_first(), - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } - } - } - - return ret; -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)source ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. 
- * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). 
- * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)destination ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. 
- * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -} // namespace cugraph diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 1bfdc23c66d..2f842f710ca 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -265,8 +266,8 @@ template void update_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { @@ -288,12 +289,12 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - 
std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -317,7 +318,7 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -325,34 +326,41 @@ void update_edge_major_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + 
handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -360,7 +368,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -386,7 +394,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -407,7 +415,7 @@ void update_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -420,20 +428,22 @@ void update_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == 
size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_firsts[0]] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -455,13 +465,11 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_value_first = edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { - using vertex_t = typename GraphViewType::vertex_type; - using bcast_buffer_type = - decltype(allocate_dataframe_buffer< - std::conditional_t>( - size_t{0}, handle.get_stream())); + using vertex_t = typename GraphViewType::vertex_type; + using bcast_buffer_type = dataframe_buffer_type_t< + std::conditional_t>; auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -487,8 +495,8 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); + 
num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; @@ -532,15 +540,17 @@ void update_edge_minor_property(raft::handle_t const& handle, *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets()); } } else { - std::vector rx_counts(major_comm_size, size_t{0}); + std::vector local_v_list_sizes(major_comm_size, size_t{0}); for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); + local_v_list_sizes[i] = + graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector rx_displacements(major_comm_size, size_t{0}); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + std::exclusive_scan( + local_v_list_sizes.begin(), local_v_list_sizes.end(), rx_displacements.begin(), size_t{0}); key_offsets_or_rx_displacements = std::move(rx_displacements); } @@ -683,8 +693,8 @@ template void update_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { @@ -706,22 +716,49 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - 
std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - std::conditional_t>( - contains_packed_bool_element ? packed_bool_size(max_rx_size) : max_rx_size, - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? 
(num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -735,13 +772,23 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + std::conditional_t>( + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + if (i == major_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -749,34 +796,53 @@ void update_edge_minor_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, 
sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (v_list_bitmap) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -784,7 +850,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -812,7 +878,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -833,7 +899,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_first); } @@ -844,20 +910,22 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_first] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + auto val_first 
= + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -909,8 +977,9 @@ void update_edge_src_property(raft::handle_t const& handle, /** * @brief Update graph edge source property values from the input vertex property values. * - * This version updates only a subset of graph edge source property values. [@p vertex_first, @p - * vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -919,10 +988,12 @@ void update_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. 
v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -937,8 +1008,8 @@ template void update_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeSrcValueOutputWrapper edge_src_property_output, bool do_expensive_check = false) @@ -946,8 +1017,8 @@ void update_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -958,23 +1029,23 @@ void update_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr 
(GraphViewType::is_storage_transposed) { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } @@ -1026,8 +1097,9 @@ void update_edge_dst_property(raft::handle_t const& handle, /** * @brief Update graph edge destination property values from the input vertex property values. * - * This version updates only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -1037,10 +1109,12 @@ void update_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. 
v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -1055,8 +1129,8 @@ template void update_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeDstValueOutputWrapper edge_dst_property_output, bool do_expensive_check = false) @@ -1064,8 +1138,8 @@ void update_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -1076,23 +1150,23 @@ void update_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr 
(GraphViewType::is_storage_transposed) { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b13e6bfd458..6e7d8515beb 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,15 +15,24 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" + +#include #include #include +#include +#include #include +#include #include #include #include +#include +#include +#include #include #include #include @@ -48,6 +57,191 @@ namespace cugraph { +template +KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, + KeyIterator sorted_unique_key_last, + vertex_t v_threshold, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + return thrust::lower_bound( + rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); + } else { + key_t k_threshold{}; + thrust::get<0>(k_threshold) = v_threshold; + return thrust::lower_bound( + rmm::exec_policy(stream_view), + sorted_unique_key_first, + sorted_unique_key_last, + k_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } +} + +template +std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, + KeyIterator sorted_key_last, + raft::host_span segment_offsets, + vertex_t vertex_range_first, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + std::vector h_thresholds(segment_offsets.size() - 2); + for 
(size_t i = 0; i < h_thresholds.size(); ++i) { + h_thresholds[i] = vertex_range_first + segment_offsets[i + 1]; + } + + rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); + raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); + + rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); + if constexpr (std::is_same_v) { + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_key_first, + sorted_key_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } else { + auto sorted_vertex_first = + thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_vertex_first, + sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } + + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + h_offsets[0] = size_t{0}; + h_offsets.back() = static_cast(thrust::distance(sorted_key_first, sorted_key_last)); + + return h_offsets; +} + +template +rmm::device_uvector compute_vertex_list_bitmap_info( + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + auto bitmap = rmm::device_uvector( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + rmm::device_uvector lasts(bitmap.size(), stream_view); + auto bdry_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{1}), + cuda::proclaim_return_type( + [vertex_range_first, + vertex_range_size = vertex_range_last - 
vertex_range_first] __device__(vertex_t i) { + return vertex_range_first + + static_cast( + std::min(packed_bools_per_word() * i, static_cast(vertex_range_size))); + })); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + bdry_first, + bdry_first + bitmap.size(), + lasts.begin()); + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + bitmap.begin(), + bitmap.end(), + cuda::proclaim_return_type( + [sorted_unique_vertex_first, + vertex_range_first, + lasts = raft::device_span(lasts.data(), lasts.size())] __device__(size_t i) { + auto offset_first = (i != 0) ? lasts[i - 1] : vertex_t{0}; + auto offset_last = lasts[i]; + auto ret = packed_bool_empty_mask(); + for (auto j = offset_first; j < offset_last; ++j) { + auto v_offset = *(sorted_unique_vertex_first + j) - vertex_range_first; + ret |= packed_bool_mask(v_offset); + } + return ret; + })); + + return bitmap; +} + +template +void device_bcast_vertex_list( + raft::comms::comms_t const& comm, + std::variant, InputVertexIterator> v_list, + OutputVertexIterator output_v_first, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + size_t v_list_size, + int root, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + static_assert( + std::is_same_v::value_type, vertex_t>); + + if (v_list.index() == 0) { // bitmap + rmm::device_uvector tmp_bitmap( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); + device_bcast( + comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + rmm::device_scalar dummy(size_t{0}, stream_view); // we already know the count + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + 
thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + raft::device_span(dummy.data(), size_t{1}), + stream_view); + } else { + device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); + } +} + +template +void retrieve_vertex_list_from_bitmap( + raft::device_span bitmap, + OutputVertexIterator output_v_first, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + assert(bitmap.size() >= /* one bit per vertex offset read by the stencil below; no comm/root exists in this function */ + packed_bool_size(vertex_range_last - vertex_range_first)); + detail::copy_if_nosync(thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type([bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); +} + // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. @@ -328,20 +522,6 @@ class key_bucket_t { } } - auto const begin() const - { - if constexpr (std::is_same_v) { - return vertices_.index() == 0 ?
std::get<0>(vertices_).begin() - : std::get<1>(vertices_).begin(); - } else { - return vertices_.index() == 0 - ? thrust::make_zip_iterator( - thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) - : thrust::make_zip_iterator( - thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); - } - } - auto begin() { CUGRAPH_EXPECTS( @@ -355,12 +535,22 @@ class key_bucket_t { } } - auto const end() const + auto const cbegin() const { - return begin() + - (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + if constexpr (std::is_same_v) { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); + } else { + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); + } } + auto const begin() const { return cbegin(); } + auto end() { CUGRAPH_EXPECTS( @@ -369,15 +559,13 @@ class key_bucket_t { return begin() + std::get<0>(vertices_).size(); } - auto const vertex_begin() const + auto const cend() const { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); } - auto const vertex_end() const - { - return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); - } + auto const end() const { return cend(); } auto vertex_begin() { @@ -387,6 +575,13 @@ class key_bucket_t { return std::get<0>(vertices_).begin(); } + auto const vertex_cbegin() const + { + return vertices_.index() == 0 ? 
std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } + + auto const vertex_begin() const { return vertex_cbegin(); } + auto vertex_end() { CUGRAPH_EXPECTS( @@ -395,6 +590,13 @@ class key_bucket_t { return std::get<0>(vertices_).end(); } + auto const vertex_cend() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } + + auto const vertex_end() const { return vertex_cend(); } + bool is_owning() { return (vertices_.index() == 0); } private: diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a1..e040366fe25 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -44,6 +44,7 @@ #include #include +#include namespace cugraph { @@ -299,6 +300,121 @@ bool check_no_parallel_edge(raft::handle_t const& handle, (org_edge_first + edgelist_srcs.size()); } +template +std::vector> +split_edge_chunk_compressed_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_compressed_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors, + size_t element_size) +{ + auto num_chunks = edgelist_compressed_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_compressed_elements{}; + edge_partition_compressed_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < 
num_edge_partitions; ++i) { + edge_partition_compressed_elements.push_back(rmm::device_uvector( + edge_partition_edge_counts[i] * element_size, handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy( + handle.get_thrust_policy(), + edgelist_compressed_elements[k].begin() + segment_offset * element_size, + edgelist_compressed_elements[k].begin() + (segment_offset + segment_size) * element_size, + edge_partition_compressed_elements[i].begin() + output_offset * element_size); + } + } + } + edgelist_compressed_elements.clear(); + + return edge_partition_compressed_elements; +} + +template +std::vector> split_edge_chunk_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors) +{ + static_assert(std::is_arithmetic_v); // otherwise, unimplemented. 
+ auto num_chunks = edgelist_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_elements{}; + edge_partition_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_elements.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy(handle.get_thrust_policy(), + edgelist_elements[k].begin() + segment_offset, + edgelist_elements[k].begin() + (segment_offset + segment_size), + edge_partition_elements[i].begin() + output_offset); + } + } + } + edgelist_elements.clear(); + + return edge_partition_elements; +} + +template +void decompress_vertices(raft::handle_t const& handle, + raft::device_span compressed_vertices, + raft::device_span vertices, + size_t compressed_v_size) +{ + auto input_v_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [byte_first = compressed_vertices.begin(), compressed_v_size] __device__(size_t i) { + uint64_t v{0}; + for (size_t j = 0; j < compressed_v_size; ++j) { + auto b = *(byte_first + i * 
compressed_v_size + j); + v |= static_cast(b) << (8 * j); + } + return static_cast(v); + })); + thrust::copy( + handle.get_thrust_policy(), input_v_first, input_v_first + vertices.size(), vertices.begin()); +} + template >>&& edge_partition_edgelist_weights, std::optional>>&& edge_partition_edgelist_edge_ids, std::optional>>&& edge_partition_edgelist_edge_types, - std::vector> const& edgelist_intra_partition_segment_offsets, + std::vector> const& edgelist_intra_partition_segment_offset_vectors, graph_properties_t graph_properties, bool renumber) { @@ -335,6 +451,10 @@ create_graph_from_partitioned_edgelist( auto const minor_comm_size = minor_comm.get_size(); // 1. renumber +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 0" << std::endl; +#endif std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) { @@ -347,14 +467,14 @@ create_graph_from_partitioned_edgelist( src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = - cugraph::renumber_edgelist(handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( + handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offset_vectors, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); @@ -362,6 +482,10 @@ create_graph_from_partitioned_edgelist( num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); // 2. 
sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 1" << std::endl; +#endif auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; @@ -369,7 +493,7 @@ create_graph_from_partitioned_edgelist( if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = - 0.25; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -567,6 +691,10 @@ create_graph_from_partitioned_edgelist( } // 3. segmented sort neighbors +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 2" << std::endl; +#endif for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { if (edge_partition_weights) { @@ -653,6 +781,10 @@ create_graph_from_partitioned_edgelist( } // 4. create a graph and an edge_property_t object. 
+#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 3" << std::endl; +#endif std::optional, weight_t>> edge_weights{std::nullopt}; @@ -684,11 +816,13 @@ create_graph_from_partitioned_edgelist( std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets, + meta.edge_partition_hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -790,7 +924,7 @@ create_graph_from_edgelist_impl( handle.sync_stream(); std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = std::vector>( + auto edgelist_intra_partition_segment_offset_vectors = std::vector>( minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); for (int i = 0; i < minor_comm_size; ++i) { edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, @@ -798,7 +932,7 @@ create_graph_from_edgelist_impl( edge_t{0}); std::partial_sum(h_edge_counts.begin() + major_comm_size * i, h_edge_counts.begin() + major_comm_size * (i + 1), - edgelist_intra_partition_segment_offsets[i].begin() + 1); + edgelist_intra_partition_segment_offset_vectors[i].begin() + 1); } std::vector edgelist_displacements(minor_comm_size, edge_t{0}); std::partial_sum(edgelist_edge_counts.begin(), @@ -898,7 +1032,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1021,30 +1155,73 @@ 
create_graph_from_edgelist_impl( } } - // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + auto num_chunks = edgelist_srcs.size(); + + // 1. set whether to temporarily compress vertex IDs or not in splitting edge chunks + + size_t compressed_v_size = + sizeof(vertex_t); // if set to a value smaller than sizeof(vertex_t), temporarily store vertex + // IDs in compressed_v_size byte variables + + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID + static_assert(std::is_signed_v); // __clzll takes a signed integer + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + edge_t num_edges{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + num_edges += edgelist_srcs[i].size(); + } + bool compress{false}; + if (static_cast(num_edges) * element_size > + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + compress = true; + } + + if (compress) { + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + compressed_v_size = sizeof(vertex_t) - (min_clz / 8); + compressed_v_size = std::max( + compressed_v_size, size_t{5}); // FIXME: max(compressed_v_size, size_t{1}) is 
sufficient, + // but we need to check whether this works at scale 40 + } + } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_edgelist_impl 0 compressed_v_size=" << compressed_v_size + << std::endl; +#endif + + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). - std::vector>> edgelist_partitioned_srcs( - edgelist_srcs.size()); - std::vector>> edgelist_partitioned_dsts( - edgelist_srcs.size()); - auto edgelist_partitioned_weights = - edgelist_weights ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_ids = - edgelist_edge_ids - ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_types = - edgelist_edge_types - ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - - for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + std::vector> edgelist_edge_offset_vectors(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } std::optional> this_chunk_edge_ids{std::nullopt}; @@ -1060,6 +1237,9 @@ create_graph_from_edgelist_impl( this_chunk_edge_ids, this_chunk_edge_types, true); + if (this_chunk_weights) { (*edgelist_weights)[i] = std::move(*this_chunk_weights); } + if (this_chunk_edge_ids) { (*edgelist_edge_ids)[i] = std::move(*this_chunk_edge_ids); } + if (this_chunk_edge_types) { (*edgelist_edge_types)[i] = std::move(*this_chunk_edge_types); } std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); raft::update_host(h_this_chunk_edge_counts.data(), @@ -1067,132 +1247,89 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); 
handle.sync_stream(); - std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); - std::exclusive_scan(h_this_chunk_edge_counts.begin(), + std::vector h_this_chunk_edge_offsets( + h_this_chunk_edge_counts.size() + 1, + 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the + // local minor range) + std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), - h_this_chunk_edge_displacements.begin(), - size_t{0}); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); - edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); - } - edgelist_srcs[i].resize(0, handle.get_stream()); - edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); - edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); - } - edgelist_dsts[i].resize(0, handle.get_stream()); - edgelist_dsts[i].shrink_to_fit(handle.get_stream()); - - if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = (*this_chunk_weights).begin() + 
h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_weights.size(), - tmp_weights.begin()); - (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); - } - (*this_chunk_weights).resize(0, handle.get_stream()); - (*this_chunk_weights).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_ids.size(), - tmp_edge_ids.begin()); - (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); - } - (*this_chunk_edge_ids).resize(0, handle.get_stream()); - (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_types.size(), - tmp_edge_types.begin()); - (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); - } - (*this_chunk_edge_types).resize(0, handle.get_stream()); - (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); + h_this_chunk_edge_offsets.begin() + 1); + edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); + } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_edgelist_impl 1" << std::endl; +#endif + + // 3. 
compress edge chunk source/destination vertices to cut intermediate peak memory requirement + + std::optional>> edgelist_compressed_srcs{std::nullopt}; + std::optional>> edgelist_compressed_dsts{std::nullopt}; + if (compressed_v_size < sizeof(vertex_t)) { + edgelist_compressed_srcs = std::vector>{}; + edgelist_compressed_dsts = std::vector>{}; + (*edgelist_compressed_srcs).reserve(num_chunks); + (*edgelist_compressed_dsts).reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks + // compress source values + auto tmp_srcs = rmm::device_uvector(edgelist_srcs[i].size() * compressed_v_size, + handle.get_stream()); + auto input_src_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_src_first, + input_src_first + edgelist_srcs[i].size() * compressed_v_size, + tmp_srcs.begin()); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_srcs).push_back(std::move(tmp_srcs)); + + // compress destination values + + auto tmp_dsts = rmm::device_uvector(edgelist_dsts[i].size() * compressed_v_size, + handle.get_stream()); + auto input_dst_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_dst_first, + input_dst_first + edgelist_dsts[i].size() * compressed_v_size, + tmp_dsts.begin()); + 
edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); } } - edgelist_srcs.clear(); - edgelist_dsts.clear(); - if (edgelist_weights) { (*edgelist_weights).clear(); } - if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } - if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - - // 2. split the grouped edge chunks to local partitions - auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); + // 4. compute additional copy_offset vectors + // FIXME: we can store chunk data in multiple rmm::device_uvector objects to free memory earlier - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - auto edge_partition_edgelist_weights = - edgelist_partitioned_weights ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } - auto edge_partition_edgelist_edge_ids = - edgelist_partitioned_edge_ids - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_ids) { - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - } - auto edge_partition_edgelist_edge_types = - edgelist_partitioned_edge_types - ? 
std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_types) { - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); - } - - for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + std::vector edge_partition_edge_counts(minor_comm_size); + std::vector> edge_partition_intra_partition_segment_offset_vectors( + minor_comm_size); + std::vector> edge_partition_intra_segment_copy_output_displacement_vectors( + minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); - std::vector intra_segment_copy_output_displacements(major_comm_size * - edgelist_partitioned_srcs.size()); + std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; - for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { - auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = - displacement; + intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } @@ -1201,93 +1338,169 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); - rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { - auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; - thrust::copy( - 
handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + edge_partition_edge_counts[i] = edge_count; + edge_partition_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offsets); + edge_partition_intra_segment_copy_output_displacement_vectors[i] = + std::move(intra_segment_copy_output_displacements); + } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_edgelist_impl 2" << std::endl; +#endif - rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + // 5. 
split the grouped edge chunks to local partitions - if (edge_partition_edgelist_weights) { - rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); + std::vector> edge_partition_edgelist_srcs{}; + std::vector> edge_partition_edgelist_dsts{}; + std::optional>> edge_partition_edgelist_weights{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_ids{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_types{ + std::nullopt}; + + std::optional>> + edge_partition_edgelist_compressed_srcs{}; + std::optional>> + edge_partition_edgelist_compressed_dsts{}; + + if (compressed_v_size < sizeof(vertex_t)) { + edge_partition_edgelist_compressed_srcs = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + + edge_partition_edgelist_compressed_dsts = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + } else { + 
edge_partition_edgelist_srcs = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + + edge_partition_edgelist_dsts = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + + if (edgelist_weights) { + edge_partition_edgelist_weights = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_weights), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_ids), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_types), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_edgelist_impl 3" << std::endl; +#endif + + // 6. 
decompress edge chunk source/destination vertices to cut intermediate peak memory + // requirement + + if (compressed_v_size < sizeof(vertex_t)) { + assert(edge_partition_edgelist_compressed_srcs); + assert(edge_partition_edgelist_compressed_dsts); + + std::vector> h_edge_partition_edgelist_compressed_srcs(minor_comm_size); + std::vector> h_edge_partition_edgelist_compressed_dsts(minor_comm_size); + for (size_t i = 0; i < static_cast(minor_comm_size); ++i) { + h_edge_partition_edgelist_compressed_srcs[i].resize(edge_partition_edge_counts[i] * + compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_srcs[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size(), + handle.get_stream()); + + h_edge_partition_edgelist_compressed_dsts[i].resize(edge_partition_edge_counts[i] * + compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_dsts[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size(), + handle.get_stream()); } + (*edge_partition_edgelist_compressed_srcs).clear(); + (*edge_partition_edgelist_compressed_dsts).clear(); - if (edge_partition_edgelist_edge_ids) { - rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); + edge_partition_edgelist_srcs.reserve(minor_comm_size); + 
edge_partition_edgelist_dsts.reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + edge_partition_edgelist_srcs.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + edge_partition_edgelist_dsts.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); } + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_bytes(edge_partition_edge_counts[i] * compressed_v_size, + handle.get_stream()); - if (edge_partition_edgelist_edge_types) { - rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_srcs[i].data(), + h_edge_partition_edgelist_compressed_srcs[i].size(), + handle.get_stream()); + decompress_vertices(handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_srcs[i].data(), + edge_partition_edgelist_srcs[i].size()), + compressed_v_size); + + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_dsts[i].data(), + h_edge_partition_edgelist_compressed_dsts[i].size(), + handle.get_stream()); + decompress_vertices(handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_dsts[i].data(), + edge_partition_edgelist_dsts[i].size()), + 
compressed_v_size); } - edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + handle.sync_stream(); } return create_graph_from_partitioned_edgelist(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge " + "list is " "not symmetric."); } @@ -1377,7 +1591,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge " + "list " "has parallel edges."); } } @@ -1605,7 +1820,8 @@ create_graph_from_edgelist_impl( cugraph::graph_meta_t{ num_vertices, graph_properties, - renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), + renumber ? 
std::optional>{meta.segment_offsets} : std::nullopt, + meta.hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -1759,15 +1975,15 @@ create_graph_from_edgelist_impl( renumber); if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(aggregate_edgelist_srcs.data(), - aggregate_edgelist_srcs.size()), - raft::device_span(aggregate_edgelist_dsts.data(), - aggregate_edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); + CUGRAPH_EXPECTS((check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the " + "input edge list is " + "not symmetric."); } if (!graph_properties.is_multigraph) { @@ -1777,7 +1993,8 @@ create_graph_from_edgelist_impl( aggregate_edgelist_srcs.size()), raft::device_span(aggregate_edgelist_dsts.data(), aggregate_edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but " + "the input edge list " "has parallel edges."); } } diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index 1ef975c1dec..86e3c45ca2f 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -60,7 +60,8 @@ rmm::device_uvector compute_sparse_offsets( bool edgelist_major_sorted, rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_range_last - major_range_first) + 1, stream_view); + rmm::device_uvector offsets(static_cast(major_range_last - major_range_first) + 1, + stream_view); if (edgelist_major_sorted) { 
offsets.set_element_to_zero_async(0, stream_view); thrust::upper_bound(rmm::exec_policy(stream_view), @@ -77,7 +78,9 @@ rmm::device_uvector compute_sparse_offsets( edgelist_major_first, edgelist_major_last, [offset_view, major_range_first] __device__(auto v) { - atomicAdd(&offset_view[v - major_range_first], edge_t{1}); + cuda::atomic_ref atomic_counter( + offset_view[v - major_range_first]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); }); thrust::exclusive_scan( @@ -246,30 +249,112 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, rmm::device_uvector offsets(0, stream_view); rmm::device_uvector indices(0, stream_view); - auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); if (edgelist_minors.size() > mem_frugal_threshold) { - offsets = compute_sparse_offsets(edgelist_majors.begin(), - edgelist_majors.end(), - major_range_first, - major_range_last, - false, - stream_view); + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if ((sizeof(vertex_t) == 8) && (static_cast(major_range_last - major_range_first) <= + static_cast(std::numeric_limits::max()))) { + rmm::device_uvector edgelist_major_offsets(edgelist_majors.size(), stream_view); + thrust::transform( + rmm::exec_policy_nosync(stream_view), + edgelist_majors.begin(), + edgelist_majors.end(), + edgelist_major_offsets.begin(), + cuda::proclaim_return_type([major_range_first] __device__(vertex_t major) { + return static_cast(major - major_range_first); + })); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + + offsets = + compute_sparse_offsets(edgelist_major_offsets.begin(), + edgelist_major_offsets.end(), + uint32_t{0}, + static_cast(major_range_last - major_range_first), + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + 
offsets.begin(), + offsets.end(), + static_cast((edgelist_major_offsets.size() * (i + 1)) / 4)))); + } - auto pivot = major_range_first + static_cast(thrust::distance( - offsets.begin(), - thrust::lower_bound(rmm::exec_policy(stream_view), - offsets.begin(), - offsets.end(), - edgelist_minors.size() / 2))); - auto second_first = - detail::mem_frugal_partition(edge_first, - edge_first + edgelist_minors.size(), - thrust_tuple_get, 0>{}, - pivot, - stream_view); - thrust::sort(rmm::exec_policy(stream_view), edge_first, second_first); - thrust::sort(rmm::exec_policy(stream_view), second_first, edge_first + edgelist_minors.size()); + auto pair_first = + thrust::make_zip_iterator(edgelist_major_offsets.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(pair_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(pair_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), pair_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), + last_quarter_first, + pair_first + edgelist_major_offsets.size()); + } else { + offsets = compute_sparse_offsets(edgelist_majors.begin(), + edgelist_majors.end(), + major_range_first, + major_range_last, + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = + major_range_first + + static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + 
offsets.end(), + static_cast((edgelist_minors.size() * (i + 1)) / 4)))); + } + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(edge_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(edge_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), edge_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort( + rmm::exec_policy(stream_view), last_quarter_first, edge_first + edgelist_majors.size()); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + } } else { + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); thrust::sort(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist_minors.size()); offsets = compute_sparse_offsets(edgelist_majors.begin(), edgelist_majors.end(), @@ -277,12 +362,11 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, major_range_last, true, stream_view); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); } indices = std::move(edgelist_minors); - edgelist_majors.resize(0, stream_view); - edgelist_majors.shrink_to_fit(stream_view); - std::optional> dcs_nzd_vertices{std::nullopt}; if (major_hypersparse_first) { std::tie(offsets, dcs_nzd_vertices) = compress_hypersparse_offsets(std::move(offsets), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 
ef43b7b13ec..c0e9b7f0a54 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -146,8 +146,7 @@ update_local_sorted_unique_edge_majors_minors( auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); - auto use_dcs = - num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); + auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); std::optional>> local_sorted_unique_edge_majors{ std::nullopt}; @@ -166,14 +165,15 @@ update_local_sorted_unique_edge_majors_minors( // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets - { + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); auto minor_range_size = meta.partition.local_edge_partition_minor_range_size(); - rmm::device_uvector minor_bitmaps( - (minor_range_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + rmm::device_uvector minor_bitmaps(packed_bool_size(minor_range_size), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + minor_bitmaps.begin(), + minor_bitmaps.end(), + packed_bool_empty_mask()); for (size_t i = 0; i < edge_partition_indices.size(); ++i) { thrust::for_each(handle.get_thrust_policy(), edge_partition_indices[i].begin(), @@ -191,6 +191,10 @@ update_local_sorted_unique_edge_majors_minors( raft::comms::op_t::MAX, handle.get_stream()); + std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio + << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" + << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold + << std::endl; if (max_minor_properties_fill_ratio < 
detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { auto const chunk_size = @@ -281,92 +285,100 @@ update_local_sorted_unique_edge_majors_minors( // 2. Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets - std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), - has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); - } - auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), - num_local_unique_edge_major_counts.end()); - - vertex_t aggregate_major_range_size{0}; - for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { - aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); - } - - auto max_major_properties_fill_ratio = - host_scalar_allreduce(comm, - static_cast(num_local_unique_edge_majors) / - static_cast(aggregate_major_range_size), - raft::comms::op_t::MAX, - handle.get_stream()); - - if (max_major_properties_fill_ratio < - detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - auto const chunk_size = - static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { + std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + num_local_unique_edge_major_counts[i] = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), + has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); 
+ } + auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), + num_local_unique_edge_major_counts.end()); - local_sorted_unique_edge_majors = std::vector>{}; - local_sorted_unique_edge_major_chunk_start_offsets = - std::vector>{}; + vertex_t aggregate_major_range_size{0}; + for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { + aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); + } - (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); - (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = - meta.partition.local_edge_partition_major_range(i); - auto sparse_range_last = - use_dcs - ? (major_range_first + - meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + + auto max_major_properties_fill_ratio = + host_scalar_allreduce(comm, + static_cast(num_local_unique_edge_majors) / + static_cast(aggregate_major_range_size), + raft::comms::op_t::MAX, + handle.get_stream()); + + std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio + << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" + << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold + << std::endl; + if (max_major_properties_fill_ratio < + detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + auto const chunk_size = + static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + + local_sorted_unique_edge_majors = std::vector>{}; + local_sorted_unique_edge_major_chunk_start_offsets = + std::vector>{}; + + (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); + (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); + for (size_t i = 0; i < 
edge_partition_offsets.size(); ++i) { + auto [major_range_first, major_range_last] = + meta.partition.local_edge_partition_major_range(i); + auto sparse_range_last = + use_dcs + ? (major_range_first + + meta + .edge_partition_segment_offsets[num_segments_per_vertex_partition * i + detail::num_sparse_segments_per_vertex_partition]) - : major_range_last; - - rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], - handle.get_stream()); - CUGRAPH_EXPECTS( - sparse_range_last - major_range_first < std::numeric_limits::max(), - "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), work-around required."); - auto cur_size = thrust::distance( - unique_edge_majors.begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_range_first), - thrust::make_counting_iterator(sparse_range_last), + : major_range_last; + + rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], + handle.get_stream()); + CUGRAPH_EXPECTS(sparse_range_last - major_range_first < std::numeric_limits::max(), + "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), " + "work-around required."); + auto cur_size = thrust::distance( unique_edge_majors.begin(), - has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].end(), - unique_edge_majors.begin() + cur_size); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_range_first), + thrust::make_counting_iterator(sparse_range_last), + unique_edge_majors.begin(), + has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_dcs_nzd_vertices)[i].begin(), + (*edge_partition_dcs_nzd_vertices)[i].end(), + unique_edge_majors.begin() + cur_size); + } + + auto num_chunks = static_cast( + 
((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); + rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, + handle.get_stream()); + + auto chunk_start_vertex_first = + thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), + detail::multiply_and_add_t{ + static_cast(chunk_size), major_range_first}); + thrust::lower_bound(handle.get_thrust_policy(), + unique_edge_majors.begin(), + unique_edge_majors.end(), + chunk_start_vertex_first, + chunk_start_vertex_first + num_chunks, + unique_edge_major_chunk_start_offsets.begin()); + unique_edge_major_chunk_start_offsets.set_element( + num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); + + (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); + (*local_sorted_unique_edge_major_chunk_start_offsets) + .push_back(std::move(unique_edge_major_chunk_start_offsets)); } - - auto num_chunks = static_cast( - ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); - rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, - handle.get_stream()); - - auto chunk_start_vertex_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::multiply_and_add_t{static_cast(chunk_size), major_range_first}); - thrust::lower_bound(handle.get_thrust_policy(), - unique_edge_majors.begin(), - unique_edge_majors.end(), - chunk_start_vertex_first, - chunk_start_vertex_first + num_chunks, - unique_edge_major_chunk_start_offsets.begin()); - unique_edge_major_chunk_start_offsets.set_element( - num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); - - (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); - (*local_sorted_unique_edge_major_chunk_start_offsets) - .push_back(std::move(unique_edge_major_chunk_start_offsets)); + local_sorted_unique_edge_major_chunk_size = chunk_size; } - 
local_sorted_unique_edge_major_chunk_size = chunk_size; } return std::make_tuple(std::move(local_sorted_unique_edge_majors), @@ -378,6 +390,50 @@ update_local_sorted_unique_edge_majors_minors( std::move(local_sorted_unique_edge_minor_vertex_partition_offsets)); } +template +std::enable_if_t>> +compute_edge_partition_dcs_nzd_range_bitmaps( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::vector> const& edge_partition_dcs_nzd_vertices) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_segments_per_vertex_partition = + static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); + + std::vector> edge_partition_dcs_nzd_range_bitmaps{}; + edge_partition_dcs_nzd_range_bitmaps.reserve(edge_partition_dcs_nzd_vertices.size()); + for (size_t i = 0; i < edge_partition_dcs_nzd_vertices.size(); ++i) { + raft::host_span segment_offsets( + meta.edge_partition_segment_offsets.data() + num_segments_per_vertex_partition * i, + num_segments_per_vertex_partition); + rmm::device_uvector bitmap( + packed_bool_size(segment_offsets[detail::num_sparse_segments_per_vertex_partition + 1] - + segment_offsets[detail::num_sparse_segments_per_vertex_partition]), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + auto major_range_first = meta.partition.local_edge_partition_major_range_first(i); + auto major_hypersparse_first = + major_range_first + segment_offsets[detail::num_sparse_segments_per_vertex_partition]; + thrust::for_each(handle.get_thrust_policy(), + edge_partition_dcs_nzd_vertices[i].begin(), + edge_partition_dcs_nzd_vertices[i].end(), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first] __device__(auto major) { + auto offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + 
word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + }); + edge_partition_dcs_nzd_range_bitmaps.push_back(std::move(bitmap)); + } + + return edge_partition_dcs_nzd_range_bitmaps; +} + } // namespace template @@ -400,7 +456,8 @@ graph_t @@ -452,7 +514,8 @@ graph_t(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), - segment_offsets_(meta.segment_offsets) + segment_offsets_(meta.segment_offsets), + hypersparse_degree_offsets_(meta.hypersparse_degree_offsets) { } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index f925a142737..31de9b1e5d3 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -488,14 +488,18 @@ graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta) : detail::graph_base_t( meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), + edge_partition_dcs_nzd_range_bitmaps_(edge_partition_dcs_nzd_range_bitmaps), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), + edge_partition_hypersparse_degree_offsets_(meta.edge_partition_hypersparse_degree_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), local_sorted_unique_edge_src_chunk_start_offsets_( meta.local_sorted_unique_edge_src_chunk_start_offsets), @@ -538,7 +542,8 @@ graph_view_t #include +#include + #include #include #include @@ -233,127 +235,306 @@ std::optional find_locally_unused_ext_vertex_id( : std::nullopt /* if the entire range of vertex_t is used */; } -// returns renumber map and segment_offsets +// returns renumber map, segment_offsets, and hypersparse_degree_offsets 
template -std::tuple, std::vector, vertex_t> compute_renumber_map( - raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& edgelist_edge_counts) +std::tuple, + std::vector, + std::optional>, + vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { - rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - - edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - - // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) + // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to + // construct local_vertices) - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 0" << std::endl; +#endif + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { - sorted_unique_majors.resize(num_local_edges, handle.get_stream()); - size_t major_offset{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - sorted_unique_majors.begin() + major_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); - major_offset += static_cast(thrust::distance( - sorted_unique_majors.begin() + major_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); + constexpr size_t num_bins{ + 8}; // increase 
the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors|minors * 2 / # bins" + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + + auto edge_major_count_vectors = num_bins > 1 + ? std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_major_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_major_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_major_counts.begin(), + d_edge_major_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_major_counts.data(), + d_edge_major_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_major_count_vectors)[i].data(), + d_edge_major_counts.data(), + d_edge_major_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.resize(major_offset, handle.get_stream()); - if (edgelist_majors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + auto edge_minor_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_minor_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_minor_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_minor_counts.begin(), + d_edge_minor_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_minor_counts.data(), + d_edge_minor_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_minor_count_vectors)[i].data(), + d_edge_minor_counts.data(), + d_edge_minor_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - } - - // 2. 
if local_vertices.has_value() is false, find unique vertices from edge minors (to construct - // local_vertices) - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_minors.resize(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - } + handle.sync_stream(); - // 3. update sorted_local_vertices. 
- // if local_vertices.has_value() is false, reconstruct local_vertices first + for (size_t i = 0; i < num_bins; ++i) { + rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_majors{}; // for bin "i" + edge_partition_tmp_majors.reserve(edgelist_majors.size()); + for (size_t j = 0; j < edgelist_majors.size(); ++j) { + rmm::device_uvector tmp_majors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_majors.resize((*edge_major_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_majors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_majors.push_back(std::move(tmp_majors)); + } + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (int j = 0; j < minor_comm_size; ++j) { + tx_counts[j] = edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + 
edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + this_bin_sorted_unique_majors.begin() + output_offset); + output_offset += edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } - if (local_vertices) { + rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_minors{}; // for bin "i" + edge_partition_tmp_minors.reserve(edgelist_minors.size()); + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + rmm::device_uvector tmp_minors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_minors.resize((*edge_minor_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_minors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_minors.push_back(std::move(tmp_minors)); + } + if (edge_partition_tmp_minors.size() == 1) { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } else { + edge_t 
aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + edge_partition_tmp_minors.clear(); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_tx_counts = groupby_and_count( + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_tx_counts(d_tx_counts.size()); + 
raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector tx_displacements(h_tx_counts.size()); + std::exclusive_scan( + h_tx_counts.begin(), h_tx_counts.end(), tx_displacements.begin(), size_t{0}); + for (int j = 0; j < major_comm_size; ++j) { + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin() + tx_displacements[j], + this_bin_sorted_unique_minors.begin() + (tx_displacements[j] + h_tx_counts[j])); + } + this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( + major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); + } + } + } + rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); + { + rmm::device_uvector merged_vertices( + this_bin_sorted_unique_majors.size() + this_bin_sorted_unique_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + merged_vertices.begin()); + this_bin_sorted_unique_majors.resize(0, handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_minors.resize(0, handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + merged_vertices.resize(thrust::distance(merged_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end())), + handle.get_stream()); + merged_vertices.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_vertices = std::move(merged_vertices); + } + if (sorted_local_vertices.size() == 0) { + sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); + } else { + rmm::device_uvector merged_vertices( + sorted_local_vertices.size() + this_bin_sorted_unique_vertices.size(), + handle.get_stream()); + 
thrust::merge(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end(), + this_bin_sorted_unique_vertices.begin(), + this_bin_sorted_unique_vertices.end(), + merged_vertices.begin()); // merging two unique sets from different hash + // bins, so the merged set can't have duplicates + sorted_local_vertices = std::move(merged_vertices); + } + } + } else { sorted_local_vertices = std::move(*local_vertices); thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - } else { - sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), - handle.get_stream()); - - thrust::merge(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - sorted_local_vertices.begin()); - - sorted_unique_majors.resize(0, handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(0, handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - sorted_local_vertices = - cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle, std::move(sorted_local_vertices)); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + 
std::cerr << "compute_renumber_map 1" << std::endl; +#endif + + // 2. find an unused vertex ID auto locally_unused_vertex_id = find_locally_unused_ext_vertex_id( handle, @@ -363,17 +544,9 @@ std::tuple, std::vector, vertex_t> compu "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); - // 4. compute global degrees for the sorted local vertices + // 3. compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{ - std::nullopt}; // FIXME: move this inside the if statement - - auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more binary - // searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and temporary - // buffer requirement (cut by num_chunks times), currently set to 2 to avoid peak memory - // usage happening in this part (especially when minor_comm_size is small) if constexpr (multi_gpu) { auto& comm = handle.get_comms(); @@ -386,94 +559,37 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); - if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { - auto vertex_edge_counts = host_scalar_allreduce( - comm, - thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), - raft::comms::op_t::SUM, - handle.get_stream()); - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is approximately - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + - // (E / (comm_size * minor_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + - // std::min(V/P, (E / (comm_size * minor_comm_size)) / num_chunks) * (sizeof(vertex_t) + - // sizeof(edge_t)) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) - auto 
avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 - ? static_cast(thrust::get<1>(vertex_edge_counts)) / - static_cast(thrust::get<0>(vertex_edge_counts)) - : double{0.0}; - auto num_streams = static_cast( - (avg_vertex_degree * sizeof(vertex_t)) / - (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + - (((avg_vertex_degree / minor_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + - (std::min(1.0, ((avg_vertex_degree / minor_comm_size) / num_chunks)) * - (sizeof(vertex_t) + sizeof(edge_t))))); - if (num_streams >= 2) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - for (int i = 0; i < minor_comm_size; ++i) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) - : handle.get_stream(); - - rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], loop_stream); + rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], + handle.get_stream()); device_bcast(minor_comm, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_range_sizes[i], i, - loop_stream); + handle.get_stream()); - rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); - thrust::fill(rmm::exec_policy(loop_stream), + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), sorted_major_degrees.begin(), sorted_major_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, loop_stream); - tmp_majors.reserve( - (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, - loop_stream); - size_t offset{0}; - for (size_t j = 0; j < num_chunks; ++j) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[i]) - offset); - tmp_majors.resize(this_chunk_size, loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - 
edgelist_majors[i] + offset, - edgelist_majors[i] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [sorted_majors = + raft::device_span(sorted_majors.data(), sorted_majors.size()), + sorted_major_degrees = raft::device_span( + sorted_major_degrees.data(), sorted_major_degrees.size())] __device__(auto major) { + auto it = + thrust::lower_bound(thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); device_reduce(minor_comm, sorted_major_degrees.begin(), @@ -481,11 +597,9 @@ std::tuple, std::vector, vertex_t> compu edge_partition_major_range_sizes[i], raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); if (i == 
minor_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } else { assert(edgelist_majors.size() == 1); @@ -495,47 +609,28 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertex_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, handle.get_stream()); - tmp_majors.reserve(static_cast(edgelist_edge_counts[0] + (num_chunks - 1)) / num_chunks, - handle.get_stream()); - size_t offset{0}; - for (size_t i = 0; i < num_chunks; ++i) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[0]) - offset); - tmp_majors.resize(this_chunk_size, handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[0] + offset, - edgelist_majors[0] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); - rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_local_vertex_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + 
edgelist_edge_counts[0], + [sorted_majors = raft::device_span( + sorted_local_vertices.data(), sorted_local_vertices.size()), + sorted_major_degrees = raft::device_span( + sorted_local_vertex_degrees.data(), + sorted_local_vertex_degrees.size())] __device__(auto major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 2" << std::endl; +#endif - // 4. sort local vertices by degree (descending) + // 5. sort local vertices by degree (descending) thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -543,7 +638,7 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertices.begin(), thrust::greater()); - // 5. compute segment_offsets + // 6. 
compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -553,57 +648,90 @@ std::tuple, std::vector, vertex_t> compu (detail::hypersparse_threshold_ratio <= 1.0)); size_t mid_degree_threshold{detail::mid_degree_threshold}; size_t low_degree_threshold{detail::low_degree_threshold}; - size_t hypersparse_degree_threshold{0}; + size_t hypersparse_degree_threshold{1}; if (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); mid_degree_threshold *= minor_comm_size; low_degree_threshold *= minor_comm_size; - hypersparse_degree_threshold = - static_cast(minor_comm_size * detail::hypersparse_threshold_ratio); + hypersparse_degree_threshold = std::max( + static_cast(minor_comm_size * detail::hypersparse_threshold_ratio), size_t{1}); } - auto num_segments_per_vertex_partition = - detail::num_sparse_segments_per_vertex_partition + - (hypersparse_degree_threshold > 0 ? size_t{2} : size_t{1}); // last is 0-degree segment - rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, - handle.get_stream()); - auto h_thresholds = - hypersparse_degree_threshold > 0 - ? 
std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - static_cast(hypersparse_degree_threshold), - std::min(static_cast(hypersparse_degree_threshold), edge_t{1})} - : std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - edge_t{1}}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - - rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, - handle.get_stream()); - auto vertex_count = static_cast(sorted_local_vertices.size()); - d_segment_offsets.set_element_to_zero_async(0, handle.get_stream()); - d_segment_offsets.set_element( - num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + std::vector h_segment_offsets{}; + std::optional> h_hypersparse_degree_offsets{}; + { + auto num_partitions = detail::num_sparse_segments_per_vertex_partition /* high, mid, low */ + + (hypersparse_degree_threshold > 1 + ? hypersparse_degree_threshold - size_t{1} + /* one partition per each global degree in the hypersparse region */ + : size_t{0}) + + size_t{1} /* zero */; + rmm::device_uvector d_thresholds(num_partitions - 1, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_thresholds.begin(), + d_thresholds.end(), + [mid_degree_threshold, + low_degree_threshold, + hypersparse_degree_threshold] __device__(size_t i) { + if (i == 0) { + return mid_degree_threshold; // high,mid boundary + } else if (i == 1) { + return low_degree_threshold; // mid, low boundary + } else { + assert(hypersparse_degree_threshold > (i - 2)); + return hypersparse_degree_threshold - (i - 2); + } + }); + rmm::device_uvector d_offsets(num_partitions + 1, handle.get_stream()); + d_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto vertex_count = static_cast(sorted_local_vertices.size()); + d_offsets.set_element(num_partitions, vertex_count, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + 
sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin() + 1, + thrust::greater{}); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + handle.sync_stream(); - thrust::upper_bound(handle.get_thrust_policy(), - sorted_local_vertex_degrees.begin(), - sorted_local_vertex_degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - d_segment_offsets.begin() + 1, - thrust::greater{}); - - std::vector h_segment_offsets(d_segment_offsets.size()); - raft::update_host(h_segment_offsets.data(), - d_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 1 ? size_t{2} : size_t{1}); // last is 0-degree segment + h_segment_offsets.resize(num_segments_per_vertex_partition + 1); + std::copy(h_offsets.begin(), + h_offsets.begin() + num_sparse_segments_per_vertex_partition + 1, + h_segment_offsets.begin()); + *(h_segment_offsets.rbegin()) = *(h_offsets.rbegin()); + if (hypersparse_degree_threshold > 1) { + *(h_segment_offsets.rbegin() + 1) = *(h_offsets.rbegin() + 1); + + h_hypersparse_degree_offsets = std::vector(hypersparse_degree_threshold); + std::copy(h_offsets.begin() + num_sparse_segments_per_vertex_partition, + h_offsets.begin() + num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold - 1), + (*h_hypersparse_degree_offsets).begin()); + auto shift = (*h_hypersparse_degree_offsets)[0]; + std::transform((*h_hypersparse_degree_offsets).begin(), + (*h_hypersparse_degree_offsets).end(), + (*h_hypersparse_degree_offsets).begin(), + [shift](auto offset) { return offset - shift; }); + *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); + } + } - return std::make_tuple( - 
std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 3" << std::endl; +#endif + + return std::make_tuple(std::move(sorted_local_vertices), + h_segment_offsets, + h_hypersparse_degree_offsets, + *locally_unused_vertex_id); } template @@ -789,32 +917,28 @@ void expensive_check_edgelist( } template -std::vector aggregate_segment_offsets(raft::handle_t const& handle, - std::vector const& segment_offsets) +std::vector aggregate_offset_vectors(raft::handle_t const& handle, + std::vector const& offsets) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); - raft::update_device( - d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_segment_offsets( - minor_comm_size * d_segment_offsets.size(), handle.get_stream()); - minor_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - - std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), - vertex_t{0}); - raft::update_host(h_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), + rmm::device_uvector d_offsets(offsets.size(), handle.get_stream()); + raft::update_device(d_offsets.data(), offsets.data(), offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_offset_vectors(minor_comm_size * d_offsets.size(), + handle.get_stream()); + minor_comm.allgather( + d_offsets.data(), d_aggregate_offset_vectors.data(), d_offsets.size(), handle.get_stream()); + + std::vector h_aggregate_offset_vectors(d_aggregate_offset_vectors.size(), vertex_t{0}); + raft::update_host(h_aggregate_offset_vectors.data(), + 
d_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.size(), handle.get_stream()); handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. - return h_aggregate_segment_offsets; + return h_aggregate_offset_vectors; } } // namespace detail @@ -857,10 +981,10 @@ renumber_edgelist( (*edgelist_intra_partition_segment_offsets).size() == static_cast(minor_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); for (size_t i = 0; i < edgelist_majors.size(); ++i) { - CUGRAPH_EXPECTS( - (*edgelist_intra_partition_segment_offsets)[i].size() == - static_cast(major_comm_size + 1), - "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets)[].size()."); + CUGRAPH_EXPECTS((*edgelist_intra_partition_segment_offsets)[i].size() == + static_cast(major_comm_size + 1), + "Invalid input arguments: erroneous " + "(*edgelist_intra_partition_segment_offsets)[].size()."); CUGRAPH_EXPECTS( std::is_sorted((*edgelist_intra_partition_segment_offsets)[i].begin(), (*edgelist_intra_partition_segment_offsets)[i].end()), @@ -868,8 +992,8 @@ renumber_edgelist( CUGRAPH_EXPECTS( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), - "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 and " - "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " + "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " + "and (*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } } @@ -893,7 +1017,10 @@ renumber_edgelist( // 1. 
compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + vertex_partition_hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -934,6 +1061,10 @@ renumber_edgelist( // 3. renumber edges +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "renumber_edgelist 0" << std::endl; +#endif { vertex_t max_edge_partition_major_range_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -966,11 +1097,24 @@ renumber_edgelist( } } - if ((static_cast(partition.local_edge_partition_minor_range_size() * - 2.5 /* tuning parameter */) >= - static_cast(number_of_edges / comm_size)) && - edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) - // part than the O(E/P) part + double approx_mem_requirements = + static_cast(partition.local_edge_partition_minor_range_size()) * + (static_cast( + sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + + + static_cast(sizeof(vertex_t) * 2) * + 2.5 /* kv_store_t renumber_map, * 2.5 to consider load factor */); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "renumber_edgelist 1 partition.local_edge_partition_minor_range_size()=" + << partition.local_edge_partition_minor_range_size() + << " approx_mem_requirements=" << approx_mem_requirements << " threshold=" + << (static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) + << std::endl; +#endif + if ((approx_mem_requirements > + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && + edgelist_intra_partition_segment_offsets) { vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1020,10 +1164,10 @@ renumber_edgelist( recvcounts[i] = 
partition.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector displacements(recvcounts.size(), 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); device_allgatherv(major_comm, - renumber_map_labels.begin(), - renumber_map_minor_labels.begin(), + renumber_map_labels.data(), + renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); @@ -1044,13 +1188,25 @@ renumber_edgelist( } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "renumber_edgelist 2" << std::endl; +#endif auto edge_partition_segment_offsets = - detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); + detail::aggregate_offset_vectors(handle, vertex_partition_segment_offsets); + auto edge_partition_hypersparse_degree_offsets = + vertex_partition_hypersparse_degree_offsets + ? std::make_optional( + detail::aggregate_offset_vectors(handle, *vertex_partition_hypersparse_degree_offsets)) + : std::nullopt; return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{ - number_of_vertices, number_of_edges, partition, edge_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + edge_partition_segment_offsets, + edge_partition_hypersparse_degree_offsets}); } template @@ -1078,7 +1234,10 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + segment_offsets, + hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map( handle, std::move(vertices), @@ -1099,8 +1258,9 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map_view.find( edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors, handle.get_stream()); - return std::make_tuple(std::move(renumber_map_labels), - 
renumber_meta_t{segment_offsets}); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{segment_offsets, hypersparse_degree_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index 3efa58d9632..8f69a3c152d 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -363,7 +364,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, } std::unique_ptr> renumber_map_ptr{nullptr}; - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -402,11 +403,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, rmm::device_uvector int_vertices_for_sorted_unique_ext_vertices(0, handle.get_stream()); auto [unique_ext_vertices, int_vertices_for_unique_ext_vertices] = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, local_renumber_map.view(), std::move(sorted_unique_ext_vertices), detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); renumber_map_ptr = std::make_unique>( unique_ext_vertices.begin(), @@ -573,7 +575,6 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto local_int_vertex_first = vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - auto local_int_vertex_last = vertex_partition_range_lasts[vertex_partition_id]; rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); sorted_unique_int_vertices.resize( @@ -595,16 +596,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, sorted_unique_int_vertices.end())), handle.get_stream()); - auto [unique_int_vertices, ext_vertices_for_unique_int_vertices] = - collect_values_for_unique_int_vertices(handle, - std::move(sorted_unique_int_vertices), - renumber_map_labels, - vertex_partition_range_lasts); + auto ext_vertices_for_sorted_unique_int_vertices = + collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + renumber_map_labels, + vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); kv_store_t renumber_map( - unique_int_vertices.begin(), - unique_int_vertices.begin() + unique_int_vertices.size(), - ext_vertices_for_unique_int_vertices.begin(), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + ext_vertices_for_sorted_unique_int_vertices.begin(), invalid_vertex_id::value, invalid_vertex_id::value, handle.get_stream()); @@ -667,4 +672,102 @@ std::enable_if_t unrenumber_local_int_edges(raft::handle_t con do_expensive_check); } +template +void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check) +{ + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(handle.get_thrust_policy(), + sorted_unique_edge_dsts.begin(), + sorted_unique_edge_dsts.end(), + [int_vertex_last = vertex_partition_range_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id_v && + !is_valid_vertex(int_vertex_last, v); + }) == 
0, + "Invalid input arguments: there are out-of-range vertices in sorted_unique_edge_dsts."); + CUGRAPH_EXPECTS( + thrust::is_sorted( + handle.get_thrust_policy(), sorted_unique_edge_dsts.begin(), sorted_unique_edge_dsts.end()), + "Invalid input arguments: the input internal edge destinations are not sorted."); + CUGRAPH_EXPECTS( + static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_unique_edge_dsts.size()), + detail::is_first_in_run_t{sorted_unique_edge_dsts.data()})) == + sorted_unique_edge_dsts.size(), + "Invalid input arguments: the input internal edge destinations have duplicates."); + } + + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto const major_comm_rank = major_comm.get_rank(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); + auto local_int_vertex_first = vertex_partition_id == 0 + ? 
vertex_t{0} + : vertex_partition_range_lasts[vertex_partition_id - 1]; + + rmm::device_uvector ext_vertices_for_sorted_unique_edge_dsts(0, handle.get_stream()); + if constexpr (store_transposed) { + std::vector minor_comm_vertex_partition_range_lasts(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, i); + minor_comm_vertex_partition_range_lasts[i] = + vertex_partition_range_lasts[vertex_partition_id]; + } + ext_vertices_for_sorted_unique_edge_dsts = collect_values_for_sorted_unique_int_vertices( + minor_comm, + raft::device_span(sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size()), + renumber_map.begin(), + minor_comm_vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); + } else { + std::vector major_comm_vertex_partition_range_lasts(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, i, minor_comm_rank); + major_comm_vertex_partition_range_lasts[i] = + vertex_partition_range_lasts[vertex_partition_id]; + } + ext_vertices_for_sorted_unique_edge_dsts = collect_values_for_sorted_unique_int_vertices( + major_comm, + raft::device_span(sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size()), + renumber_map.begin(), + major_comm_vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); + } + thrust::copy(handle.get_thrust_policy(), + ext_vertices_for_sorted_unique_edge_dsts.begin(), + ext_vertices_for_sorted_unique_edge_dsts.end(), + sorted_unique_edge_dsts.begin()); + } else { + unrenumber_local_int_vertices(handle, + sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size(), + renumber_map.data(), + vertex_t{0}, + vertex_partition_range_lasts[0], + do_expensive_check); + } 
+} + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_mg_v32_e32.cu b/cpp/src/structure/renumber_utils_mg_v32_e32.cu index 93b18aeab86..987ad8a64e6 100644 --- a/cpp/src/structure/renumber_utils_mg_v32_e32.cu +++ b/cpp/src/structure/renumber_utils_mg_v32_e32.cu @@ -64,4 +64,18 @@ template void unrenumber_local_int_edges( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_mg_v64_e64.cu b/cpp/src/structure/renumber_utils_mg_v64_e64.cu index d528ade2a4c..b5911351cca 100644 --- a/cpp/src/structure/renumber_utils_mg_v64_e64.cu +++ b/cpp/src/structure/renumber_utils_mg_v64_e64.cu @@ -64,4 +64,18 @@ template void unrenumber_local_int_edges( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git 
a/cpp/src/structure/renumber_utils_sg_v32_e32.cu b/cpp/src/structure/renumber_utils_sg_v32_e32.cu index c1f4807d4a5..d106ac7ff67 100644 --- a/cpp/src/structure/renumber_utils_sg_v32_e32.cu +++ b/cpp/src/structure/renumber_utils_sg_v32_e32.cu @@ -69,4 +69,18 @@ template void unrenumber_local_int_edges(raft::handle_t co int32_t num_vertices, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_sg_v64_e64.cu b/cpp/src/structure/renumber_utils_sg_v64_e64.cu index 7a6e5d368a9..e8caca1c941 100644 --- a/cpp/src/structure/renumber_utils_sg_v64_e64.cu +++ b/cpp/src/structure/renumber_utils_sg_v64_e64.cu @@ -69,4 +69,18 @@ template void unrenumber_local_int_edges(raft::handle_t co int64_t num_vertices, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 8a18dedd2ab..7f8b3e075df 100644 --- 
a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -16,8 +16,9 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" +#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -51,6 +52,24 @@ namespace cugraph { namespace { +template +struct direction_optimizing_info_t { + rmm::device_uvector + approx_out_degrees; // if graph_view.local_vertex_partition_segment_offsets().has_value() is + // true, holds approximate degrees only for the high and mid degree + // segments; otherwise, exact + rmm::device_uvector visited_bitmap; + std::optional> nzd_unvisited_vertices{ + std::nullopt}; // valid only during bottom-up iterations + std::optional num_nzd_unvisited_low_degree_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() is true + std::optional num_nzd_unvisited_hypersparse_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() && + // graph_view.use_dcs() are both true +}; + template struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t @@ -69,18 +88,25 @@ struct topdown_e_op_t { } }; -template +template struct bottomup_e_op_t { - detail::edge_partition_endpoint_property_device_view_t + __device__ vertex_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + return dst; + } +}; + +template +struct bottomup_pred_op_t { + detail::edge_partition_endpoint_property_device_view_t prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ thrust::optional operator()( + 
__device__ bool operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - auto dst_offset = dst - dst_first; - auto old = prev_visited_flags.get(dst_offset); - return old ? thrust::optional{dst} : thrust::nullopt; + return prev_visited_flags.get(dst - dst_first); } }; @@ -88,6 +114,10 @@ struct bottomup_e_op_t { namespace detail { +#if 1 // FIXME: delete +#define BFS_PERFORMANCE_MEASUREMENT 0 +#endif + template void bfs(raft::handle_t const& handle, GraphViewType const& graph_view, @@ -107,6 +137,10 @@ void bfs(raft::handle_t const& handle, static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep0 = std::chrono::steady_clock::now(); +#endif // direction optimizing BFS implementation is based on "S. Beamer, K. Asanovic, D. Patterson, // Direction-Optimizing Breadth-First Search, 2012" @@ -144,14 +178,27 @@ void bfs(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(), static_cast(is_sorted), - raft::comms::op_t::SUM, + raft::comms::op_t::MIN, handle.get_stream())); } - CUGRAPH_EXPECTS( is_sorted, "Invalid input arguments: input sources should be sorted in the non-descending order."); + bool no_duplicates = (static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(n_sources), + is_first_in_run_t{sources})) == n_sources); + if constexpr (GraphViewType::is_multi_gpu) { + no_duplicates = static_cast(host_scalar_allreduce(handle.get_comms(), + static_cast(no_duplicates), + raft::comms::op_t::MIN, + handle.get_stream())); + } + CUGRAPH_EXPECTS(no_duplicates, + "Invalid input arguments: input sources should not have duplicates."); + auto num_invalid_vertices = thrust::count_if(handle.get_thrust_policy(), sources, @@ -188,38 
+235,131 @@ void bfs(raft::handle_t const& handle, thrust::fill(handle.get_thrust_policy(), output_first, output_first + n_sources, vertex_t{0}); // 3. update meta data for direction optimizing BFS +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep1 = std::chrono::steady_clock::now(); +#endif + + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + double direction_optimizing_alpha = + (graph_view.number_of_vertices() > 0) + ? ((static_cast(graph_view.compute_number_of_edges(handle)) / + static_cast(graph_view.number_of_vertices())) * + (1.0 / 3.75) /* tuning parameter */) + : double{1.0}; + constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter + + std::optional> aux_info{std::nullopt}; + if (direction_optimizing) { + rmm::device_uvector approx_out_degrees(0, handle.get_stream()); + if (segment_offsets) { // exploit internal knowledge for exhaustive performance optimization for + // large-scale benchmarking (the else path is sufficient for small + // clusters with few tens of GPUs) + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } - constexpr edge_t direction_optimizing_alpha = 14; - constexpr vertex_t direction_optimizing_beta = 24; + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_mask_view = graph_view.edge_mask_view(); + auto edge_partition_e_mask = + edge_mask_view + ?
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto high_and_mid_degree_segment_size = + (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for + // low & hypersparse segments, use low_degree_threshold * + // partition_size * 0.5 & partition_size * + // hypersparse_threshold_ratio * 0.5 as approximate out degrees + if (edge_partition_e_mask) { + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } else { + approx_out_degrees = edge_partition.compute_local_degrees( + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } + thrust::transform(handle.get_thrust_policy(), + approx_out_degrees.begin(), + approx_out_degrees.end(), + approx_out_degrees.begin(), + multiplier_t{static_cast( + partition_size)}); // local_degrees => approximate global degrees + } else { + approx_out_degrees = graph_view.compute_out_degrees(handle); // exact + } - std::optional> out_degrees{std::nullopt}; - std::optional> nzd_unvisited_vertices{std::nullopt}; - if (direction_optimizing) { - out_degrees = graph_view.compute_out_degrees(handle); - nzd_unvisited_vertices = rmm::device_uvector( - graph_view.local_vertex_partition_range_size(), handle.get_stream()); - (*nzd_unvisited_vertices) - .resize(thrust::distance( - (*nzd_unvisited_vertices).begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - 
thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources), - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return (out_degrees[v_offset] > edge_t{0}) && - !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - })), - handle.get_stream()); - (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + rmm::device_uvector visited_bitmap( + packed_bool_size(graph_view.local_vertex_partition_range_size()), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + visited_bitmap.begin(), + visited_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sources, + sources + n_sources, + [bitmap = raft::device_span(visited_bitmap.data(), visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + + std::optional num_nzd_unvisited_low_degree_vertices{std::nullopt}; + std::optional num_nzd_unvisited_hypersparse_vertices{std::nullopt}; + if (segment_offsets) { + num_nzd_unvisited_low_degree_vertices = (*segment_offsets)[3] - (*segment_offsets)[2]; + if (graph_view.use_dcs()) { + num_nzd_unvisited_hypersparse_vertices = (*segment_offsets)[4] - (*segment_offsets)[3]; + } + if (n_sources > 0) { + std::vector h_sources(n_sources); + raft::update_host(h_sources.data(), sources, n_sources, handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_sources.size(); ++i) { + auto v_offset = h_sources[i] - graph_view.local_vertex_partition_range_first(); + if ((v_offset >= (*segment_offsets)[2]) && 
(v_offset < (*segment_offsets)[3])) { + --(*num_nzd_unvisited_low_degree_vertices); + } else if (graph_view.use_dcs()) { + if ((v_offset >= (*segment_offsets)[3]) && (v_offset < (*segment_offsets)[4])) { + --(*num_nzd_unvisited_hypersparse_vertices); + } + } + } + } + } + + aux_info = + direction_optimizing_info_t{std::move(approx_out_degrees), + std::move(visited_bitmap), + std::nullopt, + num_nzd_unvisited_low_degree_vertices, + num_nzd_unvisited_hypersparse_vertices}; } // 4. initialize BFS frontier +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep2 = std::chrono::steady_clock::now(); +#endif constexpr size_t bucket_idx_cur = 0; constexpr size_t bucket_idx_next = 1; @@ -237,6 +377,10 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep3 = std::chrono::steady_clock::now(); +#endif fill_edge_dst_property(handle, graph_view, @@ -244,15 +388,30 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur).end(), prev_dst_visited_flags.mutable_view(), true); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep4 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = prep1 - prep0; + std::chrono::duration dur1 = prep2 - prep1; + std::chrono::duration dur2 = prep3 - prep2; + std::chrono::duration dur3 = prep4 - prep3; + std::chrono::duration dur = prep4 - prep0; + std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; +#endif // 4. 
BFS iteration vertex_t depth{0}; - bool top_down = true; - auto cur_aggregate_vertex_frontier_size = + bool topdown = true; + auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - vertex_t next_aggregate_vertex_frontier_size{}; - if (top_down) { + vertex_t next_aggregate_frontier_size{}; + if (topdown) { +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown0 = std::chrono::steady_clock::now(); +#endif topdown_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -263,14 +422,19 @@ void bfs(raft::handle_t const& handle, e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown1 = std::chrono::steady_clock::now(); +#endif auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -285,10 +449,29 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown2 = std::chrono::steady_clock::now(); +#endif - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = 
static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown3 = std::chrono::steady_clock::now(); +#endif + if (next_aggregate_frontier_size == 0) { +#if BFS_PERFORMANCE_MEASUREMENT + std::chrono::duration dur0 = topdown1 - topdown0; + std::chrono::duration dur1 = topdown2 - topdown1; + std::chrono::duration dur2 = topdown3 - topdown2; + std::chrono::duration dur = topdown3 - topdown0; + std::cerr << "depth=" << depth << " topdown (prim,vf,host) took " << dur.count() << " (" + << dur0.count() << "," << dur1.count() << "," << dur2.count() << ") s." + << std::endl; +#endif + break; + } fill_edge_dst_property(handle, graph_view, @@ -296,67 +479,170 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next).end(), prev_dst_visited_flags.mutable_view(), true); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown4 = std::chrono::steady_clock::now(); + auto topdown5 = std::chrono::steady_clock::now(); +#endif if (direction_optimizing) { - auto m_f = thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); + if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { + thrust::for_each( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = 
graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + topdown5 = std::chrono::steady_clock::now(); +#endif + double m_f{0.0}; + double m_u{0.0}; { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + partition_size = static_cast(minor_comm_size); + } + + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + + if (segment_offsets) { + // FIXME: this actually over-estimates for graphs with power-law degree distribution + auto approx_low_segment_degree = + static_cast(low_degree_threshold * partition_size) * 0.5; + auto approx_hypersparse_segment_degree = + static_cast(partition_size) * hypersparse_threshold_ratio * 0.5; + auto f_segment_offsets = compute_key_segment_offsets( + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + 
*((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + (f_segment_offsets[3] - f_segment_offsets[2]); + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + (f_segment_offsets[4] - f_segment_offsets[3]); + } + f_vertex_last = f_vertex_first + f_segment_offsets[2]; + m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_f += static_cast(f_segment_offsets[4] - f_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + + m_u = static_cast(*((*aux_info).num_nzd_unvisited_low_degree_vertices)) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_u += static_cast(*((*aux_info).num_nzd_unvisited_hypersparse_vertices)) * + approx_hypersparse_segment_degree; + } + } + + m_f += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + f_vertex_first, + f_vertex_last, + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + + m_u += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(segment_offsets + ? 
(*segment_offsets)[2] + : graph_view.local_vertex_partition_range_size()), + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + bitmap = raft::device_span( + (*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size())] __device__(vertex_t v_offset) { + auto word = bitmap[packed_bool_offset(v_offset)]; + if ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()) { // visited + return edge_t{0}; + } else { + return out_degrees[v_offset]; + } + }), + edge_t{0}, + thrust::plus{})); } - auto m_u = thrust::transform_reduce( - handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); - auto aggregate_m_f = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) - : m_f; - auto aggregate_m_u = - GraphViewType::is_multi_gpu - ? 
host_scalar_allreduce( - handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) - : m_u; + auto aggregate_m_f = m_f; + auto aggregate_m_u = m_u; + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce(handle.get_comms(), + thrust::make_tuple(m_f, m_u), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_m_f = thrust::get<0>(tmp); + aggregate_m_u = thrust::get<1>(tmp); + } +#if BFS_PERFORMANCE_MEASUREMENT + std::cerr << "m_f=" << m_f << " m_u=" << m_u + << " direction_optimizing_alpha=" << direction_optimizing_alpha + << " aggregate_m_f * direction_optimzing_alpha=" + << aggregate_m_f * direction_optimizing_alpha + << " aggregate_m_u=" << aggregate_m_u + << " cur_aggregate_frontier_size=" << cur_aggregate_frontier_size + << " next_aggregate_frontier_size=" << next_aggregate_frontier_size << std::endl; +#endif if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && - (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { - top_down = false; + (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { + topdown = false; + (*aux_info).nzd_unvisited_vertices = rmm::device_uvector( + segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator( + segment_offsets ? 
graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_last()), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) == packed_bool_empty_mask()); + })), + handle.get_stream()); } } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown6 = std::chrono::steady_clock::now(); +#endif - if (top_down) { // staying in top-down + if (topdown) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t(handle); vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); @@ -364,63 +650,161 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown7 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = topdown1 - topdown0; + std::chrono::duration dur1 = topdown2 - topdown1; + std::chrono::duration dur2 = topdown3 - topdown2; + std::chrono::duration dur3 = topdown4 - topdown3; + std::chrono::duration dur4 = topdown5 - topdown4; + std::chrono::duration dur5 = topdown6 - topdown5; + std::chrono::duration dur6 = topdown7 - topdown6; + std::chrono::duration dur = topdown7 - topdown0; + std::cerr << "depth=" << depth + << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " next topdown=" << 
topdown << " (prim,vf,host,fill,unvisited,dir,vf) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() + << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << "," + << dur6.count() << ") s." << std::endl; +#endif } else { // bottom up - bottomup_e_op_t e_op{}; - e_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.mutable_view()); - e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_src(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup0 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector new_frontier_vertex_buffer(0, handle.get_stream()); + { + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), + true); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + + // FIXME: this scatter_if and the resize below can be concurrently executed. 
+ thrust::scatter_if( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + predecessor_buffer.size(), + thrust::make_transform_iterator( + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + predecessor_buffer.begin(), + thrust::make_zip_iterator(distances, predecessor_first), + detail::is_not_equal_t{invalid_vertex}); + + new_frontier_vertex_buffer.resize(predecessor_buffer.size(), handle.get_stream()); + new_frontier_vertex_buffer.resize( + thrust::distance(new_frontier_vertex_buffer.begin(), + thrust::copy_if(handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + vertex_frontier.bucket(bucket_idx_cur).cend(), + predecessor_buffer.begin(), + new_frontier_vertex_buffer.begin(), + detail::is_not_equal_t{invalid_vertex})), + handle.get_stream()); - auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), - predecessor_buffer.begin()); - thrust::scatter( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + new_frontier_vertex_buffer.size(), - thrust::make_transform_iterator( - new_frontier_vertex_buffer.begin(), - detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), - thrust::make_zip_iterator(distances, predecessor_first)); + assert(direction_optimizing); - assert(direction_optimizing); + thrust::for_each( + handle.get_thrust_policy(), + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + 
(*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::remove_if( + handle.get_thrust_policy(), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + (*((*aux_info).nzd_unvisited_vertices)).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + handle.get_stream()); - { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - new_frontier_vertex_buffer.begin(), - new_frontier_vertex_buffer.end(), - tmp_vertices.begin())), + if (segment_offsets) { + auto key_segment_offsets = compute_key_segment_offsets( + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + key_segment_offsets[3] - key_segment_offsets[2]; + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + key_segment_offsets[4] - key_segment_offsets[3]; + } + } + } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup1 = std::chrono::steady_clock::now(); +#endif + + next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); + auto aggregate_nzd_unvisited_vertices = + static_cast((*((*aux_info).nzd_unvisited_vertices)).size()); + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce( + handle.get_comms(), + 
thrust::make_tuple(next_aggregate_frontier_size, aggregate_nzd_unvisited_vertices), + raft::comms::op_t::SUM, handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + next_aggregate_frontier_size = thrust::get<0>(tmp); + aggregate_nzd_unvisited_vertices = thrust::get<1>(tmp); } - next_aggregate_vertex_frontier_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast(new_frontier_vertex_buffer.size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast(new_frontier_vertex_buffer.size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup2 = std::chrono::steady_clock::now(); +#endif + if (next_aggregate_frontier_size == 0) { +#if BFS_PERFORMANCE_MEASUREMENT + std::chrono::duration dur0 = bottomup1 - bottomup0; + std::chrono::duration dur1 = bottomup2 - bottomup1; + std::chrono::duration dur = bottomup2 - bottomup0; + std::cerr << "depth=" << depth << " bottomup (prim+,host) took " << dur.count() << " (" + << dur0.count() << "," << dur1.count() << ") s." << std::endl; +#endif + break; + } fill_edge_dst_property(handle, graph_view, @@ -428,22 +812,18 @@ void bfs(raft::handle_t const& handle, new_frontier_vertex_buffer.end(), prev_dst_visited_flags.mutable_view(), true); - - auto aggregate_nzd_unvisted_vertices = - GraphViewType::is_multi_gpu - ? 
host_scalar_allreduce(handle.get_comms(), - static_cast((*nzd_unvisited_vertices).size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast((*nzd_unvisited_vertices).size()); - - if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < - aggregate_nzd_unvisted_vertices) && - (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { - top_down = true; +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup3 = std::chrono::steady_clock::now(); +#endif + + if ((next_aggregate_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisited_vertices) && + (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { + topdown = true; } - if (top_down) { // swithcing to top-down + if (topdown) { // swithcing to top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); @@ -451,11 +831,26 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + ((*(*aux_info).nzd_unvisited_vertices)).size())); } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup4 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = bottomup1 - bottomup0; + std::chrono::duration dur1 = bottomup2 - bottomup1; + std::chrono::duration dur2 = bottomup3 - bottomup2; + std::chrono::duration dur3 = bottomup4 - bottomup3; + std::chrono::duration dur = bottomup4 - bottomup0; + std::cerr << "depth=" << depth + << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices + << " (prim+,host,fill,vf) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") 
s." + << std::endl; +#endif } - cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; + cur_aggregate_frontier_size = next_aggregate_frontier_size; depth++; if (depth >= depth_limit) { break; } diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 40030e2e39c..d228460bec3 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -220,11 +220,15 @@ std::tuple, vertex_t> extract_bfs_paths( detail::decrement_position{}); if constexpr (multi_gpu) { - current_frontier = collect_values_for_int_vertices(handle, - current_frontier.begin(), - current_frontier.end(), - predecessors, - h_vertex_partition_range_lasts); + auto& comm = handle.get_comms(); + current_frontier = + collect_values_for_int_vertices(comm, + current_frontier.begin(), + current_frontier.end(), + predecessors, + h_vertex_partition_range_lasts, + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } else { thrust::transform(handle.get_thrust_policy(), current_frontier.begin(), diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index acf3cfe8fc5..44fa21a5252 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include @@ -147,15 +147,15 @@ k_hop_nbrs(raft::handle_t const& handle, rmm::device_uvector nbrs(0, handle.get_stream()); for (size_t iter = 0; iter < k; ++iter) { auto new_frontier_key_buffer = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, - frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{}, - 
reduce_op::null{}, - do_expensive_check); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + push_graph_view, + frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{}, + reduce_op::null{}, + do_expensive_check); if (iter < (k - 1)) { frontier.bucket(bucket_idx_cur).clear(); frontier.bucket(bucket_idx_cur) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index e1b7444b92f..594f3b933e5 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -22,7 +22,7 @@ #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -641,7 +641,6 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< - false, thrust::tuple, weight_t, vertex_t, @@ -653,8 +652,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: - extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = + detail::extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 47908524feb..3429672b151 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" 
#include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -197,7 +197,7 @@ void sssp(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); auto [new_frontier_vertex_buffer, distance_predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst( + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( handle, push_graph_view, vertex_frontier.bucket(bucket_idx_cur_near), diff --git a/cpp/src/utilities/collect_comm.cuh b/cpp/src/utilities/collect_comm.cuh index 2197409fe26..dc4267aac57 100644 --- a/cpp/src/utilities/collect_comm.cuh +++ b/cpp/src/utilities/collect_comm.cuh @@ -50,79 +50,73 @@ namespace cugraph { -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template -decltype(allocate_dataframe_buffer(0, - rmm::cuda_stream_view{})) -collect_values_for_keys(raft::handle_t const& handle, - KVStoreViewType kv_store_view, - KeyIterator collect_key_first, - KeyIterator collect_key_last, - KeyToGPUIdOp key_to_gpu_id_op) +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template +dataframe_buffer_type_t collect_values_for_keys( + raft::comms::comms_t const& comm, + KVStoreViewType kv_store_view, + KeyIterator collect_key_first, + KeyIterator collect_key_last, + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; static_assert(std::is_same_v::value_type, key_t>); using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - // 1. 
collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - handle.get_stream()); + stream_view); thrust::copy( - handle.get_thrust_policy(), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy_nosync(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy_nosync(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end())), - handle.get_stream()); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, handle.get_stream()); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - std::tie(rx_values_for_unique_keys, std::ignore) = - shuffle_values(comm, - 
get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } // 2. build a kv_store_t object for the k, v pairs in unique_keys, values_for_unique_keys. - kv_store_t unique_key_value_store( - handle.get_stream()); + kv_store_t unique_key_value_store(stream_view); if constexpr (KVStoreViewType::binary_search) { unique_key_value_store = kv_store_t(std::move(unique_keys), std::move(values_for_unique_keys), kv_store_view.invalid_value(), false, - handle.get_stream()); + stream_view); } else { auto kv_pair_first = thrust::make_zip_iterator( thrust::make_tuple(unique_keys.begin(), get_dataframe_buffer_begin(values_for_unique_keys))); auto valid_kv_pair_last = - thrust::remove_if(handle.get_thrust_policy(), + thrust::remove_if(rmm::exec_policy(stream_view), kv_pair_first, kv_pair_first + unique_keys.size(), [invalid_value = kv_store_view.invalid_value()] __device__(auto pair) { @@ -136,176 +130,173 @@ collect_values_for_keys(raft::handle_t const& handle, get_dataframe_buffer_begin(values_for_unique_keys), kv_store_view.invalid_key(), kv_store_view.invalid_value(), - handle.get_stream()); + stream_view); - unique_keys.resize(0, handle.get_stream()); - resize_dataframe_buffer(values_for_unique_keys, 0, handle.get_stream()); - unique_keys.shrink_to_fit(handle.get_stream()); - shrink_to_fit_dataframe_buffer(values_for_unique_keys, handle.get_stream()); + unique_keys.resize(0, stream_view); + resize_dataframe_buffer(values_for_unique_keys, 0, stream_view); + unique_keys.shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer(values_for_unique_keys, stream_view); } auto unique_key_value_store_view = unique_key_value_store.view(); // 3. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), handle.get_stream()); - unique_key_value_store_view.find(collect_key_first, - collect_key_last, - get_dataframe_buffer_begin(value_buffer), - handle.get_stream()); + thrust::distance(collect_key_first, collect_key_last), stream_view); + unique_key_value_store_view.find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer), stream_view); return value_buffer; } -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template std::tuple, - decltype(allocate_dataframe_buffer( - 0, cudaStream_t{nullptr}))> + dataframe_buffer_type_t> collect_values_for_unique_keys( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, KVStoreViewType kv_store_view, rmm::device_uvector&& collect_unique_keys, - KeyToGPUIdOp key_to_gpu_id_op) + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - - auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, stream_view); { auto [rx_unique_keys, rx_value_counts] = groupby_gpu_id_and_shuffle_values( comm, collect_unique_keys.begin(), collect_unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); 
kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - std::tie(values_for_collect_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + std::tie(values_for_collect_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); } return std::make_tuple(std::move(collect_unique_keys), std::move(values_for_collect_unique_keys)); } template -std::tuple< - rmm::device_uvector, - decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr}))> -collect_values_for_unique_int_vertices(raft::handle_t const& handle, - rmm::device_uvector&& collect_unique_int_vertices, - ValueIterator local_value_first, - std::vector const& vertex_partition_range_lasts) +dataframe_buffer_type_t::value_type> +collect_values_for_sorted_unique_int_vertices( + raft::comms::comms_t const& comm, + raft::device_span collect_sorted_unique_int_vertices, + ValueIterator local_value_first, + std::vector const& comm_rank_vertex_partition_range_lasts, + vertex_t local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto const major_comm_rank = major_comm.get_rank(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto const minor_comm_rank = minor_comm.get_rank(); + // 1.find tx_counts - // 1. 
groupby and shuffle internal vertices + rmm::device_uvector d_range_lasts(comm_rank_vertex_partition_range_lasts.size(), + stream_view); + raft::update_device(d_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.size(), + stream_view); - rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); + rmm::device_uvector d_offsets(d_range_lasts.size() - 1, stream_view); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + collect_sorted_unique_int_vertices.begin(), + collect_sorted_unique_int_vertices.end(), + d_range_lasts.begin(), + d_range_lasts.begin() + (d_range_lasts.size() - 1), + d_offsets.begin()); - auto [rx_int_vertices, rx_int_vertex_counts] = groupby_gpu_id_and_shuffle_values( - comm, - collect_unique_int_vertices.begin(), - collect_unique_int_vertices.end(), - detail::compute_gpu_id_from_int_vertex_t{ - raft::device_span(d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()), - major_comm_size, - minor_comm_size}, - handle.get_stream()); - - // 2: Lookup return values - - auto vertex_partition_id = - partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); - auto local_int_vertex_first = - vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - - auto value_buffer = - allocate_dataframe_buffer(rx_int_vertices.size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + h_offsets[0] = 0; + h_offsets.back() = collect_sorted_unique_int_vertices.size(); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + + std::vector tx_counts(comm_rank_vertex_partition_range_lasts.size()); + std::adjacent_difference(h_offsets.begin() + 1, h_offsets.end(), tx_counts.begin()); + + // 2. shuffle sorted unique internal vertices to the owning ranks + + auto [rx_int_vertices, rx_counts] = + shuffle_values(comm, collect_sorted_unique_int_vertices.begin(), tx_counts, stream_view); + + // 3.Lookup return values + + auto value_buffer = allocate_dataframe_buffer(rx_int_vertices.size(), stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), rx_int_vertices.begin(), rx_int_vertices.end(), get_dataframe_buffer_begin(value_buffer), - [local_value_first, local_int_vertex_first] __device__(auto v) { - return local_value_first[v - local_int_vertex_first]; + [local_value_first, local_vertex_partition_range_first] __device__(auto v) { + return local_value_first[v - local_vertex_partition_range_first]; }); + rx_int_vertices.resize(0, stream_view); + rx_int_vertices.shrink_to_fit(stream_view); - // 3: Shuffle results back to original GPU + // 4. 
Shuffle results back to the original ranks - std::tie(value_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(value_buffer), rx_int_vertex_counts, handle.get_stream()); + std::tie(value_buffer, std::ignore) = + shuffle_values(comm, get_dataframe_buffer_begin(value_buffer), rx_counts, stream_view); - return std::make_tuple(std::move(collect_unique_int_vertices), std::move(value_buffer)); + return value_buffer; } template -decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr})) +dataframe_buffer_type_t::value_type> collect_values_for_int_vertices( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, VertexIterator collect_vertex_first, VertexIterator collect_vertex_last, ValueIterator local_value_first, std::vector::value_type> const& - vertex_partition_range_lasts) + comm_rank_vertex_partition_range_lasts, + typename thrust::iterator_traits::value_type local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using vertex_t = typename thrust::iterator_traits::value_type; using value_t = typename thrust::iterator_traits::value_type; size_t input_size = thrust::distance(collect_vertex_first, collect_vertex_last); - rmm::device_uvector sorted_unique_int_vertices(input_size, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(input_size, stream_view); - raft::copy( - sorted_unique_int_vertices.data(), collect_vertex_first, input_size, handle.get_stream()); + raft::copy(sorted_unique_int_vertices.data(), collect_vertex_first, input_size, stream_view); - thrust::sort(handle.get_thrust_policy(), + thrust::sort(rmm::exec_policy_nosync(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); - auto last = thrust::unique(handle.get_thrust_policy(), + auto last = thrust::unique(rmm::exec_policy(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); 
sorted_unique_int_vertices.resize(thrust::distance(sorted_unique_int_vertices.begin(), last), - handle.get_stream()); - - auto [unique_int_vertices, tmp_value_buffer] = collect_values_for_unique_int_vertices( - handle, std::move(sorted_unique_int_vertices), local_value_first, vertex_partition_range_lasts); + stream_view); - kv_store_t kv_map(std::move(unique_int_vertices), + auto tmp_value_buffer = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + local_value_first, + comm_rank_vertex_partition_range_lasts, + local_vertex_partition_range_first, + stream_view); + + kv_store_t kv_map(std::move(sorted_unique_int_vertices), std::move(tmp_value_buffer), invalid_vertex_id::value, false, - handle.get_stream()); + stream_view); auto device_view = detail::kv_binary_search_store_device_view_t(kv_map.view()); - auto value_buffer = allocate_dataframe_buffer(input_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + auto value_buffer = allocate_dataframe_buffer(input_size, stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), collect_vertex_first, collect_vertex_last, get_dataframe_buffer_begin(value_buffer), diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh index 70327db5ffb..e13cc6dd9f7 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs.cuh +++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh @@ -61,7 +61,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( (edge_ids ? sizeof(edge_t) : size_t{0}) + (edge_types ? 
sizeof(edge_type_t) : size_t{0}); auto constexpr mem_frugal_ratio = - 0.1; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach (thrust::sort is used to // group-by by default, and thrust::sort requires temporary buffer comparable to the input // data size) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 09b1431e33b..2768f168ba1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -40,14 +40,14 @@ add_library(cugraphtestutil STATIC utilities/conversion_utilities_sg.cu utilities/debug_utilities_sg.cpp utilities/validation_utilities.cu - link_prediction/similarity_compare.cpp - centrality/betweenness_centrality_validate.cu - community/egonet_validate.cu - cores/k_core_validate.cu - structure/induced_subgraph_validate.cu - sampling/random_walks_check_sg.cu - sampling/detail/nbr_sampling_validate.cu - sampling/detail/sampling_post_processing_validate.cu + #link_prediction/similarity_compare.cpp + #centrality/betweenness_centrality_validate.cu + #community/egonet_validate.cu + #cores/k_core_validate.cu + #structure/induced_subgraph_validate.cu + #sampling/random_walks_check_sg.cu + #sampling/detail/nbr_sampling_validate.cu + #sampling/detail/sampling_post_processing_validate.cu ../../thirdparty/mmio/mmio.c) target_compile_options(cugraphtestutil @@ -594,6 +594,10 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG BFS tests ------------------------------------------------------------------------------ ConfigureTestMG(MG_BFS_TEST traversal/mg_bfs_test.cpp) + ############################################################################################### + # - MG GRAPH500 BFS tests -------------------------------------------------------------------- + ConfigureTestMG(MG_GRAPH500_BFS_TEST traversal/mg_graph500_bfs_test.cu) + 
############################################################################################### # - Extract BFS Paths tests ------------------------------------------------------------------- ConfigureTestMG(MG_EXTRACT_BFS_PATHS_TEST @@ -679,7 +683,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) + prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index 58c5e59c16f..18807b00a6b 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -95,9 +95,16 @@ extern "C" void* create_mg_raft_handle(int argc, char** argv) C_MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); C_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); C_CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); + ncclUniqueId id{}; + if (comm_rank == 0) { + C_NCCL_TRY(ncclGetUniqueId(&id)); + } + C_MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); + ncclComm_t nccl_comm{}; + C_NCCL_TRY(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank)); raft::handle_t* handle = new raft::handle_t{}; - raft::comms::initialize_mpi_comms(handle, MPI_COMM_WORLD); + raft::comms::initialize_mpi_comms(handle, MPI_COMM_WORLD, nccl_comm); #if 1 int gpu_row_comm_size = 1; diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index 7461d402b5b..a79c74675d2 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -36,6 +36,18 @@ } \ } 
while (0) +#define C_NCCL_TRY(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' at file=%s line=%d failed.", \ + #call, \ + __FILE__, \ + __LINE__); \ + exit(1); \ + } \ + } while (0) + #define C_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu similarity index 74% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 5947dd9a560..51c536bb97f 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -203,48 +203,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); - } - - auto mg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - - if constexpr (std::is_same_v) { - mg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - 
cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); } auto mg_reduce_by_dst_new_frontier_key_buffer = @@ -286,56 +245,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst // 3. 
compare SG & MG results if (prims_usecase.check_correctness) { - if constexpr (std::is_same_v) { - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_src_new_frontier_key_buffer.begin(), - mg_reduce_by_src_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_dst_new_frontier_key_buffer.begin(), - mg_reduce_by_dst_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } else { - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } - - auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_new_frontier_key_buffer.data(), - mg_reduce_by_src_new_frontier_key_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); - } - auto 
mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { @@ -356,26 +265,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - mg_reduce_by_src_aggregate_payload_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_payload_buffer.data(), - mg_reduce_by_src_payload_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_reduce_by_src_payload_buffer).data(), - std::get<0>(mg_reduce_by_src_payload_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_reduce_by_src_payload_buffer).data(), - std::get<1>(mg_reduce_by_src_payload_buffer).size()); - } - } - [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { @@ -409,22 +298,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - 
cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), @@ -471,34 +349,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - sg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - auto sg_reduce_by_dst_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = @@ -528,22 +378,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - 
cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), @@ -551,14 +390,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - ASSERT_TRUE(key_passed); - - key_passed = thrust::equal( + auto key_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), @@ -567,13 +399,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if constexpr (!std::is_same_v) { bool payload_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); - ASSERT_TRUE(payload_passed); - - payload_passed = thrust::equal( handle_->get_thrust_policy(), 
cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index c294c6d0091..810c62d5321 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -332,9 +332,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( // enable correctness checks std::make_tuple(BFS_Usecase{0, false}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)), + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)), std::make_tuple(BFS_Usecase{0, true}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)))); INSTANTIATE_TEST_SUITE_P( rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with diff --git a/cpp/tests/traversal/mg_graph500_bfs_test.cu b/cpp/tests/traversal/mg_graph500_bfs_test.cu new file mode 100644 index 00000000000..21205c5ad64 --- /dev/null +++ b/cpp/tests/traversal/mg_graph500_bfs_test.cu @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/graph_partition_utils.cuh" +#include "prims/count_if_e.cuh" +#include "prims/extract_transform_e.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/kv_store.cuh" +#include "prims/update_edge_src_dst_property.cuh" +#include "utilities/base_fixture.hpp" +#include "utilities/collect_comm.cuh" +#include "utilities/conversion_utilities.hpp" +#include "utilities/device_comm_wrapper.hpp" +#include "utilities/mg_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" +#include "utilities/thrust_wrapper.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// FIXME: replace std::cerr with std::cout + +struct Graph500_BFS_Usecase { + bool unrenumber_predecessors{true}; + bool validate{true}; +}; + +template +class Tests_GRAPH500_MGBFS + : public ::testing::TestWithParam> { + public: + Tests_GRAPH500_MGBFS() {} + + static void SetUpTestCase() + { +#if 1 + auto ret = setenv("NCCL_DEBUG", "WARN", 1); + if (ret != 0) std::cerr << "setenv(\"NCCL_DEBUG\", \"TRACE\", 1) returned " << ret << std::endl; +#endif +#if 0 // workstation + // nothing +#else +#if 0 // for CW + ret = setenv("NCCL_NET", "IB", 1); + if (ret != 0) std::cerr << "setenv(\"NCCL_NET\", \"IB\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SOCKET_IFNAME", "enp90s0f0np0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SOCKET_IFNAME\", \"enp90s0f0np0\", 1) returned " << ret + << std::endl; +#else // for EOS + ret = setenv("NCCL_COLLNET_ENABLE", "0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_COLLNET_ENABLE\", \"0\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SHARP_DISABLE", "1", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_DISABLE\", \"1\", 1) returned " << ret << std::endl; + 
ret = setenv("NCCL_SHARP_GROUP_SIZE_THRESH", "8", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_GROUP_SIZE_THRESH\", \"8\", 1) returned " << ret + << std::endl; +#endif +#endif + size_t pool_size = + 16; // note that CUDA_DEVICE_MAX_CONNECTIONS (default: 8) should be set to a value larger + // than pool_size to avoid false dependency among different streams + handle_ = cugraph::test::initialize_mg_handle(pool_size); + } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(Graph500_BFS_Usecase const& bfs_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + using edge_type_t = int32_t; // dummy + + bool constexpr store_transposed = false; + bool constexpr multi_gpu = true; + bool constexpr renumber = true; + bool constexpr test_weighted = false; + bool constexpr shuffle = false; // Graph 500 requirement (edges can't be pre-shuffled, edges + // should be shuffled in Kernel 1) + size_t constexpr num_warmup_starting_vertices = + 1; // to enforce all CUDA & NCCL initializations + size_t constexpr num_timed_starting_vertices = 64; // Graph 500 requirement (64) + + HighResTimer hr_timer{}; + + auto& comm = handle_->get_comms(); + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle_->get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle_->get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + std::cerr << "comm_size=" << comm_size << " major_comm_size=" << major_comm_size + << " minor_comm_size=" << minor_comm_size << std::endl; + + constexpr auto invalid_distance = std::numeric_limits::max(); + constexpr auto invalid_vertex = cugraph::invalid_vertex_id::value; + + // 1. 
force NCCL P2P initialization + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("NCCL P2P buffer initialization"); + } + + cugraph::test::enforce_p2p_initialization(comm, handle_->get_stream()); + cugraph::test::enforce_p2p_initialization(major_comm, handle_->get_stream()); + cugraph::test::enforce_p2p_initialization(minor_comm, handle_->get_stream()); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + // 2. create an edge list + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG Construct edge list"); + } + + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::tie(src_chunks, dst_chunks, std::ignore, std::ignore, std::ignore) = + input_usecase.template construct_edgelist( + *handle_, test_weighted, store_transposed, multi_gpu, shuffle); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + // 3. 
create an MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG Construct graph (Kernel 1)"); + } + + for (size_t i = 0; i < src_chunks.size(); ++i) { // shuffle edges +#if 1 // FIXME: delete + std::cerr << "i=" << i << " start shuffling external edges sizes=(" << src_chunks[i].size() + << "," << dst_chunks[i].size() << ")" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto start = std::chrono::steady_clock::now(); +#endif + std::tie(src_chunks[i], dst_chunks[i], std::ignore, std::ignore, std::ignore, std::ignore) = + cugraph::shuffle_external_edges( + *handle_, + std::move(src_chunks[i]), + std::move(dst_chunks[i]), + std::nullopt, + std::nullopt, + std::nullopt); +#if 1 // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration dur = end - start; + std::cerr << "i=" << i << " shuffle_external_edges took " << dur.count() << " s." 
+ << std::endl; +#endif + } + + cugraph::graph_t mg_graph(*handle_); + std::optional> mg_renumber_map{std::nullopt}; + std::tie(mg_graph, std::ignore, std::ignore, std::ignore, mg_renumber_map) = + cugraph::create_graph_from_edgelist( + *handle_, + std::nullopt, + std::move(src_chunks), + std::move(dst_chunks), + std::nullopt, + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{input_usecase.undirected() /* symmetric */, + true /* multi-graph */}, + renumber); + + auto mg_graph_view = mg_graph.view(); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + auto num_vertices = mg_graph_view.number_of_vertices(); + { + auto num_self_loops = mg_graph_view.count_self_loops(*handle_); + auto number_of_edges = mg_graph_view.compute_number_of_edges(*handle_); + if (mg_graph_view.is_symmetric()) { + std::cerr << "V=" << mg_graph_view.number_of_vertices() << " E=" << number_of_edges + << " num_self_loops=" << num_self_loops + << " undirected E=" << ((number_of_edges - num_self_loops) / 2 + num_self_loops) + << std::endl; + } + } + + // 4. 
randomly select starting vertices + + rmm::device_uvector d_starting_vertices(0, handle_->get_stream()); + { + raft::random::RngState rng_state(comm_size + comm_rank /* seed */); + auto tot_vertices = num_warmup_starting_vertices + num_timed_starting_vertices; + auto out_degrees = mg_graph_view.compute_out_degrees(*handle_); + + size_t num_generated{0}; + while (num_generated < tot_vertices) { + auto candidates = + cugraph::select_random_vertices( + *handle_, + mg_graph_view, + std::nullopt, + rng_state, + tot_vertices - num_generated, + true /* with_replacement */, + false /* sort_vertices */); + candidates.resize( + thrust::distance( + candidates.begin(), + thrust::remove_if(handle_->get_thrust_policy(), + candidates.begin(), + candidates.end(), + [v_first = mg_graph_view.local_vertex_partition_range_first(), + out_degrees = raft::device_span( + out_degrees.data(), out_degrees.size())] __device__(auto v) { + auto out_degree = out_degrees[v - v_first]; + return out_degree == 0; // remove isolated vertices + })), + handle_->get_stream()); + auto num_valids = cugraph::host_scalar_allreduce( + comm, candidates.size(), raft::comms::op_t::SUM, handle_->get_stream()); + num_generated += num_valids; + auto old_size = d_starting_vertices.size(); + d_starting_vertices.resize(old_size + candidates.size(), handle_->get_stream()); + thrust::copy(handle_->get_thrust_policy(), + candidates.begin(), + candidates.end(), + d_starting_vertices.begin() + old_size); + } +#if 1 // FIXME: delete + raft::print_device_vector( + "d_starting_vertices", d_starting_vertices.data(), d_starting_vertices.size(), std::cerr); + rmm::device_uvector d_starting_vertex_out_degrees(d_starting_vertices.size(), + handle_->get_stream()); + auto map_first = thrust::make_transform_iterator( + d_starting_vertices.begin(), + cugraph::detail::shift_left_t{mg_graph_view.local_vertex_partition_range_first()}); + thrust::gather(handle_->get_thrust_policy(), + map_first, + map_first + 
d_starting_vertex_out_degrees.size(), + out_degrees.begin(), + d_starting_vertex_out_degrees.begin()); + raft::print_device_vector( + "d_starting_vertex_out_degrees", d_starting_vertex_out_degrees.data(), d_starting_vertex_out_degrees.size(), std::cerr); +#endif + } + auto starting_vertex_counts = + cugraph::host_scalar_allgather(comm, d_starting_vertices.size(), handle_->get_stream()); + auto starting_vertex_offsets = std::vector(starting_vertex_counts.size() + 1); + starting_vertex_offsets[0] = 0; + std::inclusive_scan(starting_vertex_counts.begin(), + starting_vertex_counts.end(), + starting_vertex_offsets.begin() + 1); + + // 5. run MG BFS + + // FIXME: Graph500 doesn't require computing distances. + rmm::device_uvector d_mg_distances(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + rmm::device_uvector d_mg_predecessors( + mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + double total_elapsed{0.0}; + for (size_t i = 0; i < (num_warmup_starting_vertices + num_timed_starting_vertices); ++i) { + auto starting_vertex_comm_rank = static_cast(std::distance( + starting_vertex_offsets.begin() + 1, + std::upper_bound(starting_vertex_offsets.begin() + 1, starting_vertex_offsets.end(), i))); + raft::device_span d_starting_vertex(static_cast(nullptr), + size_t{0}); + if (comm_rank == starting_vertex_comm_rank) { + d_starting_vertex = raft::device_span( + d_starting_vertices.data() + (i - starting_vertex_offsets[comm_rank]), 1); + } + std::cerr << "start running BFS i=" << i << std::endl; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG BFS (Kernel 2)"); + } + + cugraph::bfs(*handle_, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + d_starting_vertex.data(), + d_starting_vertex.size(), + mg_graph_view.is_symmetric() ? 
true : false, + std::numeric_limits::max()); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + auto elapsed = hr_timer.stop(); + if (i >= num_warmup_starting_vertices) { total_elapsed += elapsed; } + hr_timer.display_and_clear(std::cerr); + } +#if 1 + { + size_t free{}; + size_t total{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); + std::cerr << "After BFS CUDA memory free=" << (free / (1024.0 * 1024.0 * 1024.0)) + << " total=" << (total / (1024.0 * 1024.0 * 1024.0)) << std::endl; + } +#endif + + /* compute the number of visisted edges */ + + { + rmm::device_uvector flags(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + thrust::transform(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + flags.begin(), + cuda::proclaim_return_type([invalid_distance] __device__(auto d) { + return d != invalid_distance; + })); + cugraph::edge_src_property_t edge_src_flags(*handle_, + mg_graph_view); + cugraph::update_edge_src_property( + *handle_, mg_graph_view, flags.begin(), edge_src_flags.mutable_view()); + auto m = cugraph::count_if_e( + *handle_, + mg_graph_view, + edge_src_flags.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__(auto, auto, auto src_flag, auto, auto) { return src_flag; }) / + edge_t{2}; + std::cerr << "# visited undirected edges=" << m << std::endl; + } + + if (bfs_usecase.validate) { + /* check starting vertex's predecessor */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (starting vertex's predecessor)"); + } + + { + size_t num_invalids{0}; + if (d_starting_vertex.size() > 0) { + assert(d_starting_vertex.size() == 1); + num_invalids = thrust::count_if( + handle_->get_thrust_policy(), + d_starting_vertex.begin(), + 
d_starting_vertex.end(), + [v_first = mg_graph_view.local_vertex_partition_range_first(), + predecessors = raft::device_span( + d_mg_predecessors.data(), d_mg_predecessors.size())] __device__(auto v) { + return predecessors[v - v_first] != invalid_vertex; + }); + } + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(num_invalids, 0) + << "predecessor of a starting vertex should be invalid_vertex"; // Graph 500 requires + // the predecessor of a + // starting vertex to + // be itself (cuGraph + // API specifies that + // the predecessor of a + // starting vertex is + // an invalid vertex, + // but this really + // doesn't impact + // perforamnce) + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check for cycles (update predecessor to predecessor's predecessor till reaching the + * starting vertex, if there exists a cycle, this won't finish) */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (cycle)"); + } + + { + vertex_t h_starting_vertex{}; + if (comm_rank == starting_vertex_comm_rank) { + raft::update_host( + &h_starting_vertex, d_starting_vertex.data(), 1, handle_->get_stream()); + handle_->sync_stream(); + } + h_starting_vertex = cugraph::host_scalar_bcast( + comm, h_starting_vertex, starting_vertex_comm_rank, handle_->get_stream()); + + rmm::device_uvector ancestors(d_mg_predecessors.size(), handle_->get_stream()); + ancestors.resize( + thrust::distance( + ancestors.begin(), + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_predecessors.begin(), + d_mg_predecessors.end(), + ancestors.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + + cugraph::kv_store_t kv_store( + 
thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_last()), + d_mg_predecessors.begin(), + invalid_vertex, + true /* key_sorted */, + handle_->get_stream()); + auto kv_store_view = kv_store.view(); + auto h_vertex_partition_range_lasts = mg_graph_view.vertex_partition_range_lasts(); + auto d_vertex_partition_range_lasts = + cugraph::test::to_device(*handle_, h_vertex_partition_range_lasts); + size_t level{0}; + auto aggregate_size = cugraph::host_scalar_allreduce( + comm, ancestors.size(), raft::comms::op_t::SUM, handle_->get_stream()); + while (aggregate_size > 0) { + ASSERT_TRUE(level < mg_graph_view.number_of_vertices() - 1) + << "BFS predecessor tree has a cycle."; + ancestors.resize( + thrust::distance( + ancestors.begin(), + thrust::remove_if(handle_->get_thrust_policy(), + ancestors.begin(), + ancestors.end(), + cugraph::detail::is_equal_t{h_starting_vertex})), + handle_->get_stream()); + ancestors = cugraph::collect_values_for_keys( + comm, + kv_store_view, + ancestors.begin(), + ancestors.end(), + cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}, + handle_->get_stream()); + aggregate_size = cugraph::host_scalar_allreduce( + comm, ancestors.size(), raft::comms::op_t::SUM, handle_->get_stream()); + ++level; + } + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check that distance(src) = distance(predecssor(v)) + 1 */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (predecessor tree distances)"); + } + + { + rmm::device_uvector 
tree_srcs(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + tree_srcs.resize( + thrust::distance( + tree_srcs.begin(), + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_predecessors.begin(), + d_mg_predecessors.end(), + tree_srcs.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + + auto tree_src_dists = cugraph::collect_values_for_int_vertices( + comm, + tree_srcs.begin(), + tree_srcs.end(), + d_mg_distances.begin(), + mg_graph_view.vertex_partition_range_lasts(), + mg_graph_view.local_vertex_partition_range_first(), + handle_->get_stream()); + + rmm::device_uvector tree_dst_dists(tree_src_dists.size(), + handle_->get_stream()); + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + d_mg_predecessors.begin(), + tree_dst_dists.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex}); + + auto input_pair_first = + thrust::make_zip_iterator(tree_src_dists.begin(), tree_dst_dists.begin()); + auto num_invalids = thrust::count_if(handle_->get_thrust_policy(), + input_pair_first, + input_pair_first + tree_src_dists.size(), + [] __device__(auto pair) { + auto src_dist = thrust::get<0>(pair); + auto dst_dist = thrust::get<1>(pair); + return (src_dist + 1) != dst_dist; + }); + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + + ASSERT_EQ(num_invalids, 0) + << " source and destination vertices in the BFS predecessor tree are not one hop away."; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (graph distances & connected components)"); + } + + /* check distances and connect 
component coverage in the input graph */ + + { + constexpr size_t num_rounds = 24; // to cut peak memory usage + + rmm::device_uvector d_mg_typecasted_distances(d_mg_distances.size(), + handle_->get_stream()); + auto max_distance = thrust::transform_reduce( + handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + cuda::proclaim_return_type([invalid_distance] __device__(auto d) { + return d == invalid_distance ? vertex_t{0} : d; + }), + vertex_t{0}, + thrust::maximum{}); + max_distance = cugraph::host_scalar_allreduce( + comm, max_distance, raft::comms::op_t::MAX, handle_->get_stream()); + ASSERT_TRUE(max_distance <= std::numeric_limits::max()) + << "the input graph diameter exceeds std::numeric_limits::max(), so we " + "can't use uint8_t to store distances in validation."; + thrust::transform(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + d_mg_typecasted_distances.begin(), + cugraph::detail::typecast_t{}); + cugraph::edge_src_property_t edge_src_dist( + *handle_, mg_graph_view); + cugraph::update_edge_src_property(*handle_, + mg_graph_view, + d_mg_typecasted_distances.begin(), + edge_src_dist.mutable_view()); + + size_t num_invalids{0}; + for (size_t r = 0; r < num_rounds; ++r) { + auto dst_first = mg_graph_view.local_edge_partition_dst_range_first(); + auto dst_range_size = mg_graph_view.local_edge_partition_dst_range_size(); + auto num_this_round_dsts = + dst_range_size / num_rounds + + (r < (dst_range_size % num_rounds) ? 
vertex_t{1} : vertex_t{0}); + rmm::device_uvector this_round_dsts(num_this_round_dsts, + handle_->get_stream()); + thrust::tabulate(handle_->get_thrust_policy(), + this_round_dsts.begin(), + this_round_dsts.end(), + [dst_first, r, num_rounds] __device__(size_t i) { + return dst_first + static_cast(r + i * num_rounds); + }); + + auto this_round_dst_dists = cugraph::collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(this_round_dsts.data(), this_round_dsts.size()), + d_mg_typecasted_distances.begin(), + mg_graph_view.vertex_partition_range_lasts(), + mg_graph_view.local_vertex_partition_range_first(), + handle_->get_stream()); + + num_invalids += cugraph::count_if_e( + *handle_, + mg_graph_view, + edge_src_dist.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [invalid_distance, + num_rounds, + r, + dst_first, + this_round_dst_dists = raft::device_span( + this_round_dst_dists.data(), + this_round_dst_dists + .size())] __device__(auto src, auto dst, auto src_dist, auto, auto) { + auto dst_offset = dst - dst_first; + if ((dst_offset % num_rounds) == r) { + auto dst_dist = this_round_dst_dists[dst_offset / num_rounds]; + if (src_dist != invalid_distance) { + return (dst_dist == invalid_distance) || + (((src_dist >= dst_dist) ? 
(src_dist - dst_dist) + : (dst_dist - src_dist)) > 1); + } else { + return (dst_dist != invalid_distance); + } + } else { + return false; + } + }); + } + + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + + ASSERT_EQ(num_invalids, 0) + << "only one of the two connected vertices is reachable from the starting vertex or " + "the distances from the starting vertex differ by more than one."; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check that predecessor->v edges exist in the input graph */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (predecessor->v edge existence)"); + } + + { + rmm::device_uvector query_srcs(d_mg_predecessors.size(), handle_->get_stream()); + rmm::device_uvector query_dsts(query_srcs.size(), handle_->get_stream()); + auto input_edge_first = thrust::make_zip_iterator( + d_mg_predecessors.begin(), + thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_first())); + auto output_edge_first = + thrust::make_zip_iterator(query_srcs.begin(), query_dsts.begin()); + query_srcs.resize( + thrust::distance( + output_edge_first, + thrust::copy_if(handle_->get_thrust_policy(), + input_edge_first, + input_edge_first + d_mg_predecessors.size(), + d_mg_predecessors.begin(), + output_edge_first, + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + query_dsts.resize(query_srcs.size(), handle_->get_stream()); + +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "start shuffling edges" << std::endl; +#endif + std::tie(query_srcs, query_dsts, std::ignore, std::ignore, std::ignore, std::ignore) = + 
cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< + vertex_t, + edge_t, + weight_t, + edge_type_t>(*handle_, + std::move(query_srcs), + std::move(query_dsts), + std::nullopt, + std::nullopt, + std::nullopt, + mg_graph_view.vertex_partition_range_lasts()); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "edges shuffled, calling has_edge()" << std::endl; +#endif + + auto flags = mg_graph_view.has_edge( + *handle_, + raft::device_span(query_srcs.data(), query_srcs.size()), + raft::device_span(query_dsts.data(), query_dsts.size()), + true /* FIXME: remove */); + auto num_invalids = + thrust::count(handle_->get_thrust_policy(), flags.begin(), flags.end(), false); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "local # invalids=" << num_invalids << std::endl; +#endif + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "global # invalids=" << num_invalids << std::endl; + if (num_invalids > 0) { + rmm::device_uvector d_pair(2, handle_->get_stream()); + thrust::fill( + handle_->get_thrust_policy(), d_pair.begin(), d_pair.end(), invalid_vertex); + auto triplet_first = + thrust::make_zip_iterator(query_srcs.begin(), query_dsts.begin(), flags.begin()); + thrust::for_each(handle_->get_thrust_policy(), + triplet_first, + triplet_first + query_srcs.size(), + [pair = raft::device_span( + d_pair.data(), d_pair.size())] __device__(auto triplet) { + if (thrust::get<2>(triplet) == false) { + auto src = thrust::get<0>(triplet); + auto dst = thrust::get<1>(triplet); + printf("missing edge from src=%lld to dst=%lld\n", + (long long)src, + (long long)dst); + pair[0] = src; + pair[1] = dst; + } + }); + std::vector h_pair(2); + raft::update_host(h_pair.data(), d_pair.data(), d_pair.size(), handle_->get_stream()); + handle_->sync_stream(); + auto min_comm_rank = + 
cugraph::host_scalar_allreduce(comm, + h_pair[0] == invalid_vertex ? comm_size : comm_rank, + raft::comms::op_t::MIN, + handle_->get_stream()); + if (min_comm_rank != comm_size) { + if (comm_rank == min_comm_rank) { + std::cerr << "comm_rank=" << comm_rank << " has an invalid pair (" << h_pair[0] + << "," << h_pair[1] << ")" << std::endl; + } + auto tup = cugraph::host_scalar_bcast(comm, + thrust::make_tuple(h_pair[0], h_pair[1]), + min_comm_rank, + handle_->get_stream()); + std::cerr << "tup=(" << thrust::get<0>(tup) << "," << thrust::get<1>(tup) << ")" + << std::endl; + auto num_appears = cugraph::count_if_e( + *handle_, + mg_graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [missing_src = thrust::get<0>(tup), missing_dst = thrust::get<1>(tup)] __device__( + auto src, auto dst, auto src_dist, auto, auto) { + if (src == missing_src && dst == missing_dst) { + printf("edge %lld, %lld actually exists.\n", (long long)src, (long long)dst); + return true; + } + return false; + }); + std::cerr << "num_appears=" << num_appears << std::endl; + if (thrust::get<0>(tup) >= mg_graph_view.local_vertex_partition_range_first() && + thrust::get<0>(tup) < mg_graph_view.local_vertex_partition_range_last()) { + auto v_offset = + thrust::get<0>(tup) - mg_graph_view.local_vertex_partition_range_first(); + std::cerr << "thrust::get<0>(tup) v_offset=" << v_offset << std::endl; + raft::print_device_vector( + "thrust::get<0>(tup) dist", d_mg_distances.data() + v_offset, 1, std::cerr); + raft::print_device_vector( + "thrust::get<0>(tup) pred", d_mg_predecessors.data() + v_offset, 1, std::cerr); + } + if (thrust::get<1>(tup) >= mg_graph_view.local_vertex_partition_range_first() && + thrust::get<1>(tup) < mg_graph_view.local_vertex_partition_range_last()) { + auto v_offset = + thrust::get<1>(tup) - mg_graph_view.local_vertex_partition_range_first(); + std::cerr << "thrust::get<1>(tup) 
v_offset=" << v_offset << std::endl; + raft::print_device_vector( + "thrust::get<1>(tup) dist", d_mg_distances.data() + v_offset, 1, std::cerr); + raft::print_device_vector( + "thrust::get<1>(tup) pred", d_mg_predecessors.data() + v_offset, 1, std::cerr); + } + } + comm.barrier(); + } +#else + ASSERT_EQ(num_invalids, 0) << "predecessor->v missing in the input graph."; +#endif + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + } + } + + std::cerr << "average MG BFS (Kernel 2) time: " << (total_elapsed / num_timed_starting_vertices) + << std::endl; + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_GRAPH500_MGBFS::handle_ = nullptr; + +using Tests_GRAPH500_MGBFS_Rmat = Tests_GRAPH500_MGBFS; + +TEST_P(Tests_GRAPH500_MGBFS_Rmat, CheckInt64Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_GRAPH500_MGBFS_Rmat, + ::testing::Values( + // enable correctness checks + std::make_tuple(Graph500_BFS_Usecase{false, true}, + cugraph::test::Rmat_Usecase(10, + 16, + 0.57, + 0.19, + 0.19, + 0 /* base RNG seed */, + true /* undirected */, + true /* scramble vertex ID */)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_GRAPH500_MGBFS_Rmat, + ::testing::Values( + // disable correctness checks for large graphs + std::make_tuple(Graph500_BFS_Usecase{false, false}, + 
cugraph::test::Rmat_Usecase(20, + 16, + 0.57, + 0.19, + 0.19, + 0 /* base RNG seed */, + true /* undirected */, + true /* scramble vertex IDs */)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 25011c0c97a..7a76739f32f 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -70,17 +70,35 @@ inline auto make_managed() { return std::make_shared(total * init_alloc_ratio)), rmm::CUDA_ALLOCATION_ALIGNMENT) : rmm::align_down(std::min(free, total / 10), rmm::CUDA_ALLOCATION_ALIGNMENT); - return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); + std::optional max_alloc{}; + if (use_max) { + max_alloc = init_alloc; + } + std::cout << "init_alloc ratio=" << static_cast(init_alloc) / static_cast(total) << std::endl; + return rmm::mr::make_owning_wrapper(make_cuda(), init_alloc, max_alloc); } inline auto make_binning() @@ -108,12 +126,12 @@ inline auto make_binning() * @return Memory resource instance */ inline std::shared_ptr create_memory_resource( - std::string const& allocation_mode) + std::string const& allocation_mode, int comm_size) { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); if (allocation_mode == "pool") return make_pool(); - if (allocation_mode == "maxpool") return make_pool(true); + if (allocation_mode == "maxpool") return make_pool(true, comm_size); if (allocation_mode == "managed") return make_managed(); CUGRAPH_FAIL("Invalid RMM allocation mode"); } @@ -210,7 +228,7 @@ inline auto parse_test_options(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode, 1); \ rmm::mr::set_current_device_resource(resource.get()); \ 
cugraph::test::g_perf = cmd_opts["perf"].as(); \ cugraph::test::g_rmat_scale = \ @@ -232,6 +250,9 @@ inline auto parse_test_options(int argc, char** argv) #define CUGRAPH_MG_TEST_PROGRAM_MAIN() \ int main(int argc, char** argv) \ { \ + if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "18", 1) != 0) { \ + std::cerr << "setenv() returned ret" << std::endl; \ + } \ cugraph::test::initialize_mpi(argc, argv); \ auto comm_rank = cugraph::test::query_mpi_comm_world_rank(); \ auto comm_size = cugraph::test::query_mpi_comm_world_size(); \ @@ -241,7 +262,7 @@ inline auto parse_test_options(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode, comm_size); \ rmm::mr::set_current_device_resource(resource.get()); \ cugraph::test::g_perf = cmd_opts["perf"].as(); \ cugraph::test::g_rmat_scale = \ diff --git a/cpp/tests/utilities/mg_utilities.cpp b/cpp/tests/utilities/mg_utilities.cpp index 6f8fb8c6acd..ee2a4740b97 100644 --- a/cpp/tests/utilities/mg_utilities.cpp +++ b/cpp/tests/utilities/mg_utilities.cpp @@ -51,16 +51,26 @@ std::unique_ptr initialize_mg_handle(size_t pool_size) handle = std::make_unique(rmm::cuda_stream_per_thread, std::make_shared(pool_size)); - raft::comms::initialize_mpi_comms(handle.get(), MPI_COMM_WORLD); - auto& comm = handle->get_comms(); - auto const comm_size = comm.get_size(); + auto comm_rank = query_mpi_comm_world_rank(); + auto comm_size = query_mpi_comm_world_size(); + ncclUniqueId id{}; + if (comm_rank == 0) { + RAFT_NCCL_TRY(ncclGetUniqueId(&id)); + } + RAFT_MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); + ncclComm_t nccl_comm{}; + ncclConfig_t nccl_config = NCCL_CONFIG_INITIALIZER; + nccl_config.splitShare = 1; + RAFT_NCCL_TRY(ncclCommInitRankConfig(&nccl_comm, comm_size, id, 
comm_rank, &nccl_config)); + + raft::comms::initialize_mpi_comms(handle.get(), MPI_COMM_WORLD, nccl_comm); auto gpu_row_comm_size = static_cast(sqrt(static_cast(comm_size))); while (comm_size % gpu_row_comm_size != 0) { --gpu_row_comm_size; } - cugraph::partition_manager::init_subcomm(*handle, gpu_row_comm_size); + cugraph::partition_manager::init_subcomm(*handle, std::max(comm_size / 16, 1)); return std::move(handle); } diff --git a/cpp/tests/utilities/mg_utilities.hpp b/cpp/tests/utilities/mg_utilities.hpp index 9f98245387d..a21ee2bc525 100644 --- a/cpp/tests/utilities/mg_utilities.hpp +++ b/cpp/tests/utilities/mg_utilities.hpp @@ -29,7 +29,7 @@ void finalize_mpi(); int query_mpi_comm_world_rank(); int query_mpi_comm_world_size(); -std::unique_ptr initialize_mg_handle(size_t pool_size = 64); +std::unique_ptr initialize_mg_handle(size_t pool_size = 8 /* default value of CUDA_DEVICE_MAX_CONNECTIONS */); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0a706d1cf80..5edc722a8c6 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // legacy coo_to_csr #include @@ -234,7 +233,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { construct_edgelist(raft::handle_t const& handle, bool test_weighted, bool store_transposed, - bool multi_gpu) const + bool multi_gpu, + bool shuffle = true) const { CUGRAPH_EXPECTS( (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), @@ -246,7 +246,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 4; + 
size_t constexpr num_partitions_per_gpu = 8; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? handle.get_comms().get_size() : 1); @@ -330,7 +330,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - if (multi_gpu) { + if (multi_gpu && shuffle) { std::tie(store_transposed ? tmp_dst_v : tmp_src_v, store_transposed ? tmp_src_v : tmp_dst_v, tmp_weights_v, @@ -375,7 +375,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { translate(handle, vertex_v); - if (multi_gpu) { + if (multi_gpu && shuffle) { vertex_v = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( handle, std::move(vertex_v)); } @@ -391,6 +391,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { void set_edge_factor(size_t edge_factor) { edge_factor_ = edge_factor; } + bool undirected() const { + return undirected_; + } + private: size_t scale_{}; size_t edge_factor_{}; @@ -762,39 +766,5 @@ construct_graph(raft::handle_t const& handle, return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } -namespace legacy { - -template -std::unique_ptr> construct_graph_csr( - raft::handle_t const& handle, input_usecase_t const& input_usecase, bool test_weighted) -{ - auto [d_src_v, d_dst_v, d_weight_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, false, false); - vertex_t num_vertices{}; // assuming that vertex IDs are non-negative consecutive integers - if (d_vertices_v) { - num_vertices = - max_element( - handle, raft::device_span((*d_vertices_v).data(), (*d_vertices_v).size())) + - 1; - } else { - num_vertices = - std::max( - max_element(handle, raft::device_span(d_src_v.data(), d_src_v.size())), - max_element(handle, raft::device_span(d_dst_v.data(), d_dst_v.size()))) + - 1; - } - - cugraph::legacy::GraphCOOView cooview( - d_src_v.data(), - d_dst_v.data(), - 
d_weight_v ? d_weight_v->data() : nullptr, - num_vertices, - static_cast(d_src_v.size())); - - return cugraph::coo_to_csr(cooview); -} - -} // namespace legacy } // namespace test } // namespace cugraph