diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 3a55e50c8b..9174464473 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,9 +60,11 @@ block_spgemm gauss_seidel ------------ -.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) +.. doxygenfunction:: destroy_gs_handle() .. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 501e71e3e7..5291b7b14b 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -89,7 +89,7 @@ class ClusterGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; typedef nnz_lno_t color_t; typedef Kokkos::View color_view_t; typedef Kokkos::Bitset bitset_t; @@ -519,6 +519,7 @@ class ClusterGaussSeidel { using raw_colinds_t = Kokkos::View>; auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -528,6 +529,8 @@ class ClusterGaussSeidel { // symmetric and non-symmetric input cases. rowmap_t sym_xadj; colinds_t sym_adj; + // TODO: pass my_exec_space into KokkosGraph kernels. Requires + // https://github.com/kokkos/kokkos-kernels/issues/1879. if (!this->is_symmetric) { KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>( @@ -649,10 +652,14 @@ class ClusterGaussSeidel { #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; + + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - numClusters, numColors, colors, color_xadj, color_adj); + my_exec_space, numClusters, numColors, colors, color_xadj, color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE MyExecSpace().fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; @@ -661,6 +668,8 @@ class ClusterGaussSeidel { nnz_lno_persistent_work_host_view_t color_xadj_host( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Color xadj"), color_xadj.extent(0)); + // NOTE: the below deep copy ensures that we don't start running numeric + // on a non-default stream until our symbolic data has landed. Kokkos::deep_copy(color_xadj_host, color_xadj); gsHandle->set_color_xadj(color_xadj_host); gsHandle->set_color_adj(color_adj); @@ -735,7 +744,8 @@ class ClusterGaussSeidel { }; void initialize_numeric() { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); if (!gsHandle->is_symbolic_called()) { this->initialize_symbolic(); } @@ -751,12 +761,15 @@ class ClusterGaussSeidel { this->handle->get_suggested_team_size(suggested_vector_size); scalar_persistent_work_view_t inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Aii^-1"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "Aii^-1"), + num_rows); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); if (have_diagonal_given) { - Kokkos::deep_copy(inverse_diagonal, this->given_inverse_diagonal); + Kokkos::deep_copy(my_exec_space, inverse_diagonal, + this->given_inverse_diagonal); } else { // extract inverse diagonal from matrix Get_Matrix_Diagonals gmd(this->row_map, this->entries, this->values, @@ -764,12 +777,13 @@ class ClusterGaussSeidel { if (gsHandle->use_teams()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals", - my_exec_space(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } gsHandle->set_inverse_diagonal(inverse_diagonal); @@ -787,7 +801,8 @@ class ClusterGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool /*update_y_vector*/ = true) { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); size_type nnz = entries.extent(0); nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); @@ -797,8 +812,8 @@ class ClusterGaussSeidel { color_t numColors = gsHandle->get_num_colors(); if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector( - num_cols, x_lhs_output_vec); + KokkosKernels::Impl::zero_vector(my_exec_space, num_cols, + x_lhs_output_vec); } scalar_persistent_work_view_t inverse_diagonal = @@ -811,7 +826,7 @@ class ClusterGaussSeidel { this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); // Get clusters per team. Round down to favor finer granularity, since // this is sensitive to load imbalance nnz_lno_t clusters_per_team = @@ -825,33 +840,34 @@ class ClusterGaussSeidel { color_adj, gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), inverse_diagonal, clusters_per_team, omega); - this->IterativeTeamPSGS(gs, numColors, h_color_xadj, suggested_team_size, - suggested_vector_size, numIter, apply_forward, - apply_backward); + this->IterativeTeamPSGS(my_exec_space, gs, numColors, h_color_xadj, + suggested_team_size, suggested_vector_size, + numIter, apply_forward, apply_backward); } else { PSGS gs( this->row_map, this->entries, this->values, x_lhs_output_vec, y_rhs_input_vec, color_adj, gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), omega, inverse_diagonal); - this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, - apply_backward); + this->IterativePSGS(my_exec_space, gs, numColors, h_color_xadj, numIter, + apply_forward, apply_backward); } } template - void IterativeTeamPSGS(TPSGS& gs, color_t numColors, + void IterativeTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs, + color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, nnz_lno_t team_size, nnz_lno_t vec_size, int num_iteration, bool apply_forward, bool apply_backward) { for (int i = 0; i < num_iteration; ++i) - this->DoTeamPSGS(gs, numColors, h_color_xadj, team_size, vec_size, - apply_forward, apply_backward); + this->DoTeamPSGS(my_exec_space, gs, numColors, h_color_xadj, team_size, + vec_size, apply_forward, apply_backward); } template - void DoTeamPSGS(TPSGS& gs, color_t numColors, + void DoTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, nnz_lno_t team_size, nnz_lno_t vec_size, bool apply_forward, bool apply_backward) { @@ -865,9 +881,12 @@ class ClusterGaussSeidel { gs._color_set_end = color_index_end; Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_PSGS::forward", - team_policy_t((overall_work + gs._clusters_per_team - 1) / - gs._clusters_per_team, - team_size, vec_size), + Kokkos::Experimental::require( + team_policy_t(my_exec_space, + (overall_work + gs._clusters_per_team - 1) / + gs._clusters_per_team, + team_size, vec_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -882,10 +901,13 @@ class ClusterGaussSeidel { gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; Kokkos::parallel_for( - "KokkosSparse::GaussSeidel::Team_PSGS::forward", - team_policy_t((overall_work + gs._clusters_per_team - 1) / - gs._clusters_per_team, - team_size, vec_size), + "KokkosSparse::GaussSeidel::Team_PSGS::backward", + Kokkos::Experimental::require( + team_policy_t(my_exec_space, + (overall_work + gs._clusters_per_team - 1) / + gs._clusters_per_team, + team_size, vec_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); if (i == 0) { break; @@ -895,17 +917,18 @@ class ClusterGaussSeidel { } template - void IterativePSGS(PSGS& gs, color_t numColors, + void IterativePSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { for (int i = 0; i < num_iteration; ++i) { - this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward); + this->DoPSGS(my_exec_space, gs, numColors, h_color_xadj, apply_forward, + apply_backward); } } template - void DoPSGS(PSGS& gs, color_t numColors, + void DoPSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, bool apply_forward, bool apply_backward) { if (apply_forward) { @@ -914,10 +937,13 @@ class ClusterGaussSeidel { nnz_lno_t color_index_end = h_color_xadj(i + 1); gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; - Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::forward", - Kokkos::RangePolicy( - 0, color_index_end - color_index_begin), - gs); + Kokkos::parallel_for( + "KokkosSparse::GaussSeidel::PSGS::forward", + Kokkos::Experimental::require( + Kokkos::RangePolicy( + my_exec_space, 0, color_index_end - color_index_begin), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); } } if (apply_backward && numColors) { @@ -926,10 +952,13 @@ class ClusterGaussSeidel { nnz_lno_t color_index_end = h_color_xadj(i + 1); gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; - Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::backward", - Kokkos::RangePolicy( - 0, color_index_end - color_index_begin), - gs); + Kokkos::parallel_for( + "KokkosSparse::GaussSeidel::PSGS::backward", + Kokkos::Experimental::require( + Kokkos::RangePolicy( + my_exec_space, 0, color_index_end - color_index_begin), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); if (i == 0) { break; } diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 36ea2d9df8..6c2782b676 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -274,7 +274,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, Kokkos::parallel_reduce( "check_count host", Kokkos::RangePolicy( - 0, nodes_per_level.extent(0)), + space, 0, nodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += nodes_per_level(i); }, @@ -285,8 +285,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy( - 0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(0, dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, @@ -740,8 +739,8 @@ void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy( - 0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(space, 0, + dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f1f7a0e6cd..7ff620e047 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -576,10 +576,11 @@ class TwostageGaussSeidel { tic = timer.seconds(); #endif auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); bool two_stage = gsHandle->isTwoStage(); bool compact_form = gsHandle->isCompactForm(); GSDirection direction = gsHandle->getSweepDirection(); - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; // count nnz in local L & U matrices (rowmap_viewL/rowmap_viewU stores @@ -587,28 +588,29 @@ class TwostageGaussSeidel { ordinal_t nnzA = column_view.extent(0); ordinal_t nnzL = 0; // lower-part of diagonal block ordinal_t nnzU = 0; // upper-part of diagonal block - row_map_view_t rowmap_viewL("row_mapL", + row_map_view_t rowmap_viewL(Kokkos::view_alloc(my_exec_space, "row_mapL"), num_rows + 1); // lower-part of diagonal block - row_map_view_t rowmap_viewU("row_mapU", + row_map_view_t rowmap_viewU(Kokkos::view_alloc(my_exec_space, "row_mapU"), num_rows + 1); // upper-part of diagonal block - row_map_view_t rowmap_viewLa("row_mapLa", + row_map_view_t rowmap_viewLa(Kokkos::view_alloc(my_exec_space, "row_mapLa"), num_rows + 1); // complement of U+D - row_map_view_t rowmap_viewUa("row_mapUa", + row_map_view_t rowmap_viewUa(Kokkos::view_alloc(my_exec_space, "row_mapUa"), num_rows + 1); // complement of L+D if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( - "nnzL", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewL, rowmap_viewUa), + "nnzL", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewL, rowmap_viewUa), nnzL); } + // TODO: continue if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( "nnzU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewU, rowmap_viewLa), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewU, rowmap_viewLa), nnzU); } ordinal_t nnzLa = 0; // complement of U+D @@ -633,19 +635,19 @@ class TwostageGaussSeidel { // shift ptr so that it now contains offsets (combine it with the previous // functor calls?) if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewL); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewL); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewLa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewLa); } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewU); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewU); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewUa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewUa); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -657,31 +659,48 @@ class TwostageGaussSeidel { #endif // allocate memory to store local D values_view_t viewD( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "diags"), + num_rows); // allocate memory to store local L entries_view_t column_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesL"), + nnzL); values_view_t values_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesL"), + nnzL); // allocate memory to store local U entries_view_t column_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesU"), + nnzU); values_view_t values_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesU"), + nnzU); // allocate memory to store complement of U+D entries_view_t column_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesLa"), + nnzLa); values_view_t values_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesLa"), + nnzLa); // allocate memory to store complement of L+D entries_view_t column_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesUa"), + nnzUa); values_view_t values_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesUa"), + nnzUa); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -694,8 +713,11 @@ class TwostageGaussSeidel { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "entriesLU", range_policy(0, num_rows), - GS_Functor_t( + "entriesLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + TSGS_Functor_t( two_stage, compact_form, num_rows, rowmap_view, column_view, rowmap_viewL, column_viewL, rowmap_viewU, column_viewU, // @@ -709,6 +731,7 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream to finish work // construct CrsMat with them graph_t graphL(column_viewL, rowmap_viewL); graph_t graphU(column_viewU, rowmap_viewU); @@ -732,7 +755,9 @@ class TwostageGaussSeidel { gsHandle->setUa(crsmatUa); values_view_t viewDa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "diags"), + num_rows); gsHandle->setDa(viewDa); } @@ -744,10 +769,11 @@ class TwostageGaussSeidel { if (sptrsv_algo != SPTRSVAlgorithm::SPTRSV_CUSPARSE) { // symbolic with CuSparse needs // values - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries); + my_exec_space.fence(); // LBV: there use to be a Kokkos::fence() here, I think a space fence should be good enough? } } } @@ -762,13 +788,14 @@ class TwostageGaussSeidel { Kokkos::fence(); timer.reset(); #endif - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); // load local D from handle auto viewD = gsHandle->getD(); @@ -799,12 +826,15 @@ class TwostageGaussSeidel { // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "valueLU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, diagos_given, num_rows, - rowmap_view, column_view, values_view, d_invert_view, - rowmap_viewL, values_viewL, viewD, rowmap_viewU, - values_viewU, rowmap_viewLa, values_viewLa, viewDa, - rowmap_viewUa, values_viewUa)); + "valueLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + TSGS_Functor_t(two_stage, compact_form, diagos_given, num_rows, + rowmap_view, column_view, values_view, d_invert_view, + rowmap_viewL, values_viewL, viewD, rowmap_viewU, + values_viewU, rowmap_viewLa, values_viewLa, viewDa, + rowmap_viewUa, values_viewUa)); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -813,6 +843,8 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream work to finish before + // sptrsv, TODO: remove if (!(gsHandle->isTwoStage())) { using namespace KokkosSparse::Experimental; auto sptrsv_algo = @@ -830,10 +862,10 @@ class TwostageGaussSeidel { rowmap_viewU, column_viewU, values_viewU); // now do symbolic - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries, values_viewL); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries, values_viewU); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries, values_viewL); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries, values_viewU); } } } @@ -857,10 +889,11 @@ class TwostageGaussSeidel { #endif // - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); - scalar_t gamma = gsHandle->getInnerDampFactor(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); + scalar_t gamma = gsHandle->getInnerDampFactor(); GSDirection direction = gsHandle->getSweepDirection(); if (apply_forward && apply_backward) { @@ -883,7 +916,7 @@ class TwostageGaussSeidel { auto crsmatUa = gsHandle->getUa(); // complement of U+D (used only for compact form) - // wratp A into crsmat + // wrap A into crsmat input_crsmat_t crsmatA("A", num_rows, num_cols, values_view.extent(0), values_view, rowmap_view, column_view); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -916,36 +949,36 @@ class TwostageGaussSeidel { NumSweeps *= 2; } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector( - nrhs, localX); + KokkosKernels::Impl::zero_vector(my_exec_space, nrhs, localX); } for (int sweep = 0; sweep < NumSweeps; ++sweep) { bool forward_sweep = (direction == GS_FORWARD || (direction == GS_SYMMETRIC && sweep % 2 == 0)); // compute residual vector - KokkosBlas::scal(localR, one, localB); + KokkosBlas::scal(my_exec_space, localR, one, localB); if (sweep > 0 || !init_zero_x_vector) { if (compact_form) { if (forward_sweep) { // R = B - U*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatUa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatUa, + localX, one, localR); } else { // R = B - L*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatLa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatLa, + localX, one, localR); } if (omega != one) { // R = B - (U + (1-1/omega)D)*x scalar_t omega2 = (one / omega - one); auto localY = Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); - KokkosBlas::mult(zero, localZ, one, localDa, localY); - KokkosBlas::axpy(omega2, localZ, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localDa, localY); + KokkosBlas::axpy(my_exec_space, omega2, localZ, localR); } } else { // not compact_form // R = B - A*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, + localX, one, localR); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { auto localRj = @@ -981,8 +1014,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, - crsmatL.graph.entries, crsmatL.values, Rj, Zj); + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvL_handle(), + crsmatL.graph.row_map, crsmatL.graph.entries, + crsmatL.values, Rj, Zj); } } else { using namespace KokkosSparse::Experimental; @@ -995,8 +1029,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, - crsmatU.graph.entries, crsmatU.values, Rj, Zj); + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvU_handle(), + crsmatU.graph.row_map, crsmatU.graph.entries, + crsmatU.values, Rj, Zj); } } @@ -1005,21 +1040,25 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y = omega * Z - KokkosBlas::scal(localY, one, localZ); + KokkosBlas::scal(my_exec_space, localY, one, localZ); } else { // Y = Y + omega * Z - KokkosBlas::axpy(one, localZ, localY); + KokkosBlas::axpy(my_exec_space, one, localZ, localY); } } else { // ====== inner Jacobi-Richardson ===== #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS // compute initial residual norm // > compute RHS for the inner loop, R = B - A*x - internal_vector_view_t tempR("tempR", num_rows, 1); - KokkosBlas::scal(tempR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, tempR); + internal_vector_view_t tempR( + Kokkos::view_alloc(my_exec_space, std::string("tempR")), num_rows, + 1); + KokkosBlas::scal(my_exec_space, tempR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, tempR); // > initial vector for the inner loop is zero - Kokkos::deep_copy(localZ, zero); + Kokkos::deep_copy(my_exec_space, localZ, zero); + my_exec_space.fence(); using Norm_Functor_t = TwostageGaussSeidel_functor; @@ -1027,7 +1066,7 @@ class TwostageGaussSeidel { { mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1042,23 +1081,23 @@ class TwostageGaussSeidel { // row-scale: (D^{-1}*L)*Y = D^{-1}*B // compute Z := D^{-1}*R - KokkosBlas::mult(zero, localZ, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localD, localR); // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); } } else { // copy to localT (workspace used to save D^{-1}*R for JR iteration) - KokkosBlas::mult(zero, localT, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localT, one, localD, localR); // initialize Jacobi-Richardson (using R as workspace for JR // iteration) - KokkosBlas::scal(localR, one, localT); + KokkosBlas::scal(my_exec_space, localR, one, localT); // apply inner damping factor, if not one if (gamma != one) { // R = gamma * R - KokkosBlas::scal(localR, gamma, localR); + KokkosBlas::scal(my_exec_space, localR, gamma, localR); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -1066,7 +1105,7 @@ class TwostageGaussSeidel { // compute residual norm of the starting vector (D^{-1}R) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localT, tempR), normR); @@ -1077,34 +1116,34 @@ class TwostageGaussSeidel { for (int ii = 0; ii < NumInnerSweeps; ii++) { // T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U // copy T into Z - KokkosBlas::scal(localZ, one, localT); + KokkosBlas::scal(my_exec_space, localZ, one, localT); if (forward_sweep) { // Z = Z - L*R - KokkosSparse::spmv("N", scalar_t(-omega), crsmatL, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatL, + localR, one, localZ); } else { // Z = R - U*T - KokkosSparse::spmv("N", scalar_t(-omega), crsmatU, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatU, + localR, one, localZ); } // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); // Z = Z + (one - one/gamma) * R scalar_t gamma2 = one - gamma; - KokkosBlas::axpy(gamma2, localR, localZ); + KokkosBlas::axpy(my_exec_space, gamma2, localR, localZ); } if (ii + 1 < NumInnerSweeps) { // reinitialize (R to be Z) - KokkosBlas::scal(localR, one, localZ); + KokkosBlas::scal(my_exec_space, localR, one, localZ); } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // compute residual norm(r - (L+D)*y) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1119,22 +1158,23 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y := omega * z - KokkosBlas::scal(localY, omega, localZ); + KokkosBlas::scal(my_exec_space, localY, omega, localZ); } else { // Y := X + omega * Z - KokkosBlas::axpy(omega, localZ, localY); + KokkosBlas::axpy(my_exec_space, omega, localZ, localY); } } // end of inner GS sweep } // end of outer GS sweep #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // R = B - A*x - KokkosBlas::scal(localR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosBlas::scal(my_exec_space, localR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, localR); auto localRj = Kokkos::subview(localR, Kokkos::ALL(), range_type(0, 1)); single_vector_view_t Rj(localRj.data(), num_rows); - std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2(Rj) - << std::endl; + std::cout << "norm(GS)-" << NumSweeps << " " + << KokkosBlas::nrm2(my_exec_space, Rj) << std::endl; } #endif } diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 680045823e..317079f09a 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -605,19 +605,25 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * @note: Caller is responsible for freeing this via destroy_gs_handle() + * * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: - * - * KokkosSpace::GS_DEFAULT PointGaussSeidel - * KokkosSpace::GS_PERMUTED ?? - * KokkosSpace::GS_TEAM ?? - * KokkosSpace::GS_CLUSTER ?? - * KokkosSpace::GS_TWOSTAGE ?? + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. + * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. + * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. + * KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. Within a cluster, x is updated sequentially. + * For more information, see: https://arxiv.org/pdf/2204.02934.pdf. + * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. + * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * - * KokkosGraph::COLORING_DEFAULT ?? + * + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array @@ -649,7 +655,8 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * @note: Caller is responsible for freeing this via destroy_gs_handle() + * * @param gs_algorithm Specifies which algorithm to use: * * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. @@ -744,7 +751,10 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gs handle object - * + * @note: Caller is responsible for freeing this via destroy_gs_handle() + * + * @param handle_exec_space The execution space instance to execute kernels on. + * @param num_streams The number of streams to allocate memory for. * @param clusterAlgo Specifies which clustering algorithm to use: * * KokkosSparse::CLUSTER_DEFAULT ?? @@ -753,8 +763,11 @@ class KokkosKernelsHandle { * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * - * KokkosGraph::COLORING_DEFAULT ?? + * + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array @@ -766,15 +779,61 @@ class KokkosKernelsHandle { * backwards compatibility for SPGEMM and other use cases) */ // clang-format on - void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, + void create_gs_handle(const HandleExecSpace &handle_exec_space, + int num_streams, + KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; this->gsHandle = new ClusterGaussSeidelHandleType( - clusterAlgo, hint_verts_per_cluster, coloring_algorithm); + handle_exec_space, num_streams, clusterAlgo, hint_verts_per_cluster, + coloring_algorithm); } + + /** + * @brief Create a gs handle object + * @note: Caller is responsible for freeing this via destroy_gs_handle() + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS + * ?? + * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: + * + * KokkosGraph::COLORING_DEFAULT Depends on + * execution space: COLORING_SERIAL on Kokkos::Serial; COLORING_EB on GPUs; + * COLORING_VBBIT + * on Kokkos::Sycl or elsewhere. KokkosGraph::COLORING_SERIAL Serial Greedy + * Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based + * Coloring with bit array KokkosGraph::COLORING_VBCS Vertex Based Color + * Set KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based + * Deterministic Coloring with bit array KokkosGraph::COLORING_EB Edge + * Based Coloring KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph + * Coloring (kept here for backwards compatibility for SPGEMM and other use + * cases) + */ + // clang-format on + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, + nnz_lno_t hint_verts_per_cluster, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + create_gs_handle(handle_exec_space, 1, clusterAlgo, hint_verts_per_cluster, + coloring_algorithm); + } + + /** + * Free the dynamically allocated gs handle allocation. + */ void destroy_gs_handle() { if (is_owner_of_the_gs_handle && this->gsHandle != NULL) { delete this->gsHandle; diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 036fe1b119..4e362f2781 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,13 +29,13 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -112,7 +112,7 @@ void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -141,7 +141,7 @@ void gauss_seidel_symbolic(KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -173,15 +173,15 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -279,8 +279,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -313,16 +313,16 @@ void gauss_seidel_numeric(KernelHandle *handle, /// numeric values). This version accepts the matrix's inverse diagonal from the /// user. /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -427,8 +427,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -468,7 +468,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param handle handle A KokkosKernelsHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -504,7 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -512,12 +512,12 @@ void block_gauss_seidel_numeric( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle handle A KokkosKernelsHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -680,8 +680,8 @@ void symmetric_gauss_seidel_apply( /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -727,10 +727,10 @@ void symmetric_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance. +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -793,12 +793,12 @@ void symmetric_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -960,10 +960,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1008,10 +1008,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -1067,7 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1075,12 +1075,12 @@ void forward_sweep_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1242,10 +1242,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1290,10 +1290,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 624382ec5b..b947227ac0 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -104,6 +104,7 @@ class GaussSeidelHandle { bool called_symbolic; bool called_numeric; + bool execution_space_is_set; int suggested_vector_size; int suggested_team_size; @@ -121,6 +122,7 @@ class GaussSeidelHandle { numColors(0), called_symbolic(false), called_numeric(false), + execution_space_is_set(false), suggested_vector_size(0), suggested_team_size(0) {} @@ -134,6 +136,7 @@ class GaussSeidelHandle { numColors(0), called_symbolic(false), called_numeric(false), + execution_space_is_set(false), suggested_vector_size(0), suggested_team_size(0) {} @@ -159,8 +162,7 @@ class GaussSeidelHandle { template void set_execution_space(const ExecSpaceIn exec_space_in) { - static bool is_set = false; - if (!is_set) { + if (!execution_space_is_set) { static_assert(std::is_same::value, "The type of exec_space_in should be the same as " "GaussSeidelHandle::HandleExecSpace"); @@ -172,7 +174,6 @@ class GaussSeidelHandle { "without multiple handles. Please create a new handle via " "create_gs_handle.\n"); } - is_set = true; } void set_algorithm_type(const GSAlgorithm sgs_algo) { @@ -496,7 +497,7 @@ class ClusterGaussSeidelHandle typedef GaussSeidelHandle GSHandle; - typedef ExecutionSpace HandleExecSpace; + using HandleExecSpace = typename GSHandle::HandleExecSpace; typedef TemporaryMemorySpace HandleTempMemorySpace; typedef PersistentMemorySpace HandlePersistentMemorySpace; @@ -553,15 +554,12 @@ class ClusterGaussSeidelHandle nnz_lno_persistent_work_view_t vert_clusters; public: - /** - * \brief Default constructor. - */ - - // Constructor for cluster-coloring based GS and SGS - ClusterGaussSeidelHandle(ClusteringAlgorithm cluster_algo_, + // Constructors for cluster-coloring based GS and SGS + ClusterGaussSeidelHandle(GSHandle gs_handle, + ClusteringAlgorithm cluster_algo_, nnz_lno_t cluster_size_, KokkosGraph::ColoringAlgorithm coloring_algo_) - : GSHandle(GS_CLUSTER), + : GSHandle(gs_handle), cluster_algo(cluster_algo_), cluster_size(cluster_size_), coloring_algo(coloring_algo_), @@ -570,6 +568,23 @@ class ClusterGaussSeidelHandle cluster_adj(), vert_clusters() {} + /** + * \brief Default constructor. + */ + ClusterGaussSeidelHandle(ClusteringAlgorithm cluster_algo_, + nnz_lno_t cluster_size_, + KokkosGraph::ColoringAlgorithm coloring_algo_) + : ClusterGaussSeidelHandle(GSHandle(GS_CLUSTER), cluster_algo_, + cluster_size_, coloring_algo_) {} + + ClusterGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + ClusteringAlgorithm cluster_algo_, + nnz_lno_t cluster_size_, + KokkosGraph::ColoringAlgorithm coloring_algo_) + : ClusterGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_CLUSTER), cluster_algo_, + cluster_size_, coloring_algo_) {} + void set_cluster_size(nnz_lno_t cs) { this->cluster_size = cs; } nnz_lno_t get_cluster_size() const { return this->cluster_size; } @@ -767,9 +782,18 @@ class TwoStageGaussSeidelHandle void initVectors(int nrows_, int nrhs_) { if (this->nrows != nrows_ || this->nrhs != nrhs_) { - this->localR = vector_view_t("temp", nrows_, nrhs_); - this->localT = vector_view_t("temp", nrows_, nrhs_); - this->localZ = vector_view_t("temp", nrows_, nrhs_); + vector_view_t r( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localR = r; + vector_view_t t( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localT = t; + vector_view_t z( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localZ = z; this->nrows = nrows_; this->nrhs = nrhs_; } diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 48c7d41a91..64285328c1 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -756,6 +756,10 @@ void test_gauss_seidel_streams_rank1( execution_space(), std::vector(nstreams, 1)); std::vector kh_v(nstreams); + std::vector kh_psgs_v(nstreams); + std::vector kh_tsgs_v(nstreams); + std::vector kh_tsgs_classic_v(nstreams); + std::vector kh_cluster_v(nstreams); std::vector input_mat_v(nstreams); std::vector solution_x_v(nstreams); std::vector x_vector_v(nstreams); @@ -764,6 +768,8 @@ void test_gauss_seidel_streams_rank1( const scalar_t one = Kokkos::ArithTraits::one(); const scalar_t zero = Kokkos::ArithTraits::zero(); + // two-stage with SpTRSV supports only omega = one + auto omega_tsgs_classic = one; for (int i = 0; i < nstreams; i++) { input_mat_v[i] = @@ -792,17 +798,27 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); x_vector_v[i] = x_vector_tmp; - kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. - kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + kh_psgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_classic_v[i] = + KernelHandle(); // Initialize KokkosKernelsHandle defaults. + + kh_psgs_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, + coloringAlgo); + kh_tsgs_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_v[i].set_gs_twostage(true, input_mat_v[i].numRows()); + kh_tsgs_classic_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_classic_v[i].set_gs_twostage(false, input_mat_v[i].numRows()); } - int apply_count = 3; // test symmetric, forward, backward //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(instances[i], x_vector_v[i], zero); - run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + run_gauss_seidel_streams(instances, kh_psgs_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); for (int i = 0; i < nstreams; i++) { @@ -814,7 +830,78 @@ void test_gauss_seidel_streams_rank1( } } - for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); + //*** Cluster-coloring version **** + int clusterSizes[3] = {2, 5, 34}; + std::vector clusteringAlgos = {CLUSTER_MIS2, + CLUSTER_BALLOON}; + for (int csize = 0; csize < 3; csize++) { + for (auto clusterAlgo : clusteringAlgos) { + for (int i = 0; i < nstreams; i++) { + kh_cluster_v[i] = + KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_cluster_v[i].create_gs_handle(instances[i], nstreams, clusterAlgo, + clusterSizes[csize], coloringAlgo); + } + + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) { + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + instances[i].fence(); + } + + run_gauss_seidel_streams(instances, kh_cluster_v, input_mat_v, + x_vector_v, y_vector_v, symmetric, m_omega, + apply_type, nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "with (clusterSize:" << clusterSizes[csize] + << ", clusterAlgo:" << clusterAlgo + << ",coloringAlgo:" << coloringAlgo << ") on stream_idx: " << i; + } + } + for (int i = 0; i < nstreams; i++) kh_cluster_v[i].destroy_gs_handle(); + } + } + + //*** Two-stage version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + //*** Two-stage version (classic) **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_classic_v, input_mat_v, + x_vector_v, y_vector_v, symmetric, + omega_tsgs_classic, apply_type, nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + + for (int i = 0; i < nstreams; i++) { + kh_psgs_v[i].destroy_gs_handle(); + kh_tsgs_v[i].destroy_gs_handle(); + kh_tsgs_classic_v[i].destroy_gs_handle(); + } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ @@ -830,9 +917,6 @@ void test_gauss_seidel_streams_rank1( test_gauss_seidel_streams_rank1( \ 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ 1); \ - test_gauss_seidel_streams_rank1( \ - 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ - 2); \ test_gauss_seidel_streams_rank1( \ 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ 3); \