diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 6ef813d59f3..a9a310db7a5 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -493,12 +493,12 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_hop( const cugraph_sample_result_t* result); /** - * @brief Get the hop offsets from the sampling algorithm result + * @brief Get the label-hop offsets from the sampling algorithm result * * @param [in] result The result from a sampling algorithm - * @return type erased array pointing to the hop offsets + * @return type erased array pointing to the label-hop offsets */ -cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_hop_offsets( +cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_hop_offsets( const cugraph_sample_result_t* result); /** @@ -511,7 +511,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_index( const cugraph_sample_result_t* result); /** - * @deprecated This call should be replaced with cugraph_sample_get_get_label_offsets + * @deprecated This call should be replaced with cugraph_sample_result_get_label_hop_offsets * @brief Get the result offsets from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -520,15 +520,6 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_index( cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_offsets( const cugraph_sample_result_t* result); -/** - * @brief Get the result label offsets from the sampling algorithm result - * - * @param [in] result The result from a sampling algorithm - * @return type erased array pointing to the result label offsets - */ -cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_offsets( - const cugraph_sample_result_t* result); - /** * @brief Get the renumber map * diff --git 
a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 17b66ec8aaa..6ae1cf6d259 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -50,9 +50,8 @@ struct cugraph_sample_result_t { cugraph_type_erased_device_array_t* edge_type_{nullptr}; cugraph_type_erased_device_array_t* wgt_{nullptr}; cugraph_type_erased_device_array_t* hop_{nullptr}; - cugraph_type_erased_device_array_t* hop_offsets_{nullptr}; + cugraph_type_erased_device_array_t* label_hop_offsets_{nullptr}; cugraph_type_erased_device_array_t* label_{nullptr}; - cugraph_type_erased_device_array_t* label_offsets_{nullptr}; cugraph_type_erased_device_array_t* renumber_map_{nullptr}; cugraph_type_erased_device_array_t* renumber_map_offsets_{nullptr}; }; @@ -237,7 +236,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct rmm::device_uvector minors(0, handle_.get_stream()); std::optional> major_offsets{std::nullopt}; - std::optional> hop_offsets{std::nullopt}; + std::optional> label_hop_offsets{std::nullopt}; std::optional> renumber_map{std::nullopt}; std::optional> renumber_map_offsets{std::nullopt}; @@ -245,21 +244,20 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct if (options_.renumber_results_) { bool src_is_major = (options_.compression_type_ == cugraph::compression_type_t::CSR) || (options_.compression_type_ == cugraph::compression_type_t::DCSR); - if (options_.compression_type_ != cugraph::compression_type_t::COO) { - bool doubly_compress = - (options_.compression_type_ == cugraph::compression_type_t::DCSR) || - (options_.compression_type_ == cugraph::compression_type_t::DCSC); - - std::tie(majors, - *major_offsets, + if (options_.compression_type_ == cugraph::compression_type_t::COO) { + // COO + + rmm::device_uvector output_majors(0, handle_.get_stream()); + rmm::device_uvector output_renumber_map(0, handle_.get_stream()); + 
std::tie(output_majors, minors, wgt, edge_id, edge_type, - hop_offsets, - *renumber_map, + label_hop_offsets, + output_renumber_map, renumber_map_offsets) = - cugraph::renumber_and_compress_sampled_edgelist( + cugraph::renumber_and_sort_sampled_edgelist( handle_, std::move(src), std::move(dst), @@ -273,20 +271,29 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct edge_label->size())) : std::nullopt, src_is_major, - options_.compress_per_hop_, - doubly_compress, do_expensive_check_); + + majors.emplace(std::move(output_majors)); + renumber_map.emplace(std::move(output_renumber_map)); } else { - // COO - std::tie(*majors, + // (D)CSC, (D)CSR + + bool doubly_compress = + (options_.compression_type_ == cugraph::compression_type_t::DCSR) || + (options_.compression_type_ == cugraph::compression_type_t::DCSC); + + rmm::device_uvector output_major_offsets(0, handle_.get_stream()); + rmm::device_uvector output_renumber_map(0, handle_.get_stream()); + std::tie(majors, + output_major_offsets, minors, wgt, edge_id, edge_type, - hop_offsets, - *renumber_map, + label_hop_offsets, + renumber_map, renumber_map_offsets) = - cugraph::renumber_and_sort_sampled_edgelist( + cugraph::renumber_and_compress_sampled_edgelist( handle_, std::move(src), std::move(dst), @@ -300,31 +307,24 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct edge_label->size())) : std::nullopt, src_is_major, + options_.compress_per_hop_, + doubly_compress, do_expensive_check_); + + renumber_map.emplace(std::move(output_renumber_map)); + major_offsets.emplace(std::move(output_major_offsets)); } + // These are now represented by label_hop_offsets hop.reset(); offsets.reset(); } else { - *majors = std::move(src); - minors = std::move(dst); + majors.emplace(std::move(src)); + minors = std::move(dst); + + label_hop_offsets = std::move(offsets); } - /* - cugraph_type_erased_device_array_t* major_offsets_{nullptr}; - cugraph_type_erased_device_array_t* 
majors_{nullptr}; - cugraph_type_erased_device_array_t* minors_{nullptr}; - cugraph_type_erased_device_array_t* edge_id_{nullptr}; - cugraph_type_erased_device_array_t* edge_type_{nullptr}; - cugraph_type_erased_device_array_t* wgt_{nullptr}; - cugraph_type_erased_device_array_t* hop_{nullptr}; - cugraph_type_erased_device_array_t* hop_offsets_{nullptr}; - cugraph_type_erased_device_array_t* label_{nullptr}; - cugraph_type_erased_device_array_t* label_offsets_{nullptr}; - cugraph_type_erased_device_array_t* renumber_map_{nullptr}; - cugraph_type_erased_device_array_t* renumber_map_offsets_{nullptr}; - */ - result_ = new cugraph::c_api::cugraph_sample_result_t{ (major_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*major_offsets, SIZE_T) @@ -341,14 +341,12 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct : nullptr, (wgt) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*wgt, graph_->weight_type_) : nullptr, - (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32) : nullptr, - (hop_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop_offsets, SIZE_T) + (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32) : nullptr, // FIXME get rid of this once Seunghwa updates the API + (label_hop_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*label_hop_offsets, SIZE_T) : nullptr, (edge_label) ? new cugraph::c_api::cugraph_type_erased_device_array_t(edge_label.value(), INT32) : nullptr, - (offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(offsets.value(), SIZE_T) - : nullptr, (renumber_map) ? 
new cugraph::c_api::cugraph_type_erased_device_array_t( renumber_map.value(), graph_->vertex_type_) : nullptr, @@ -402,7 +400,25 @@ extern "C" void cugraph_sampling_set_return_hops(cugraph_sampling_options_t* opt extern "C" void cugraph_sampling_set_compression_type(cugraph_sampling_options_t* options, cugraph_compression_type_t value) { auto internal_pointer = reinterpret_cast(options); - internal_pointer->compression_type_ = value; + switch(value) { + case COO: + internal_pointer->compression_type_ = cugraph::compression_type_t::COO; + break; + case CSR: + internal_pointer->compression_type_ = cugraph::compression_type_t::CSR; + break; + case CSC: + internal_pointer->compression_type_ = cugraph::compression_type_t::CSC; + break; + case DCSR: + internal_pointer->compression_type_ = cugraph::compression_type_t::DCSR; + break; + case DCSC: + internal_pointer->compression_type_ = cugraph::compression_type_t::DCSC; + break; + default: + CUGRAPH_FAIL("Invalid compression type"); + } } extern "C" void cugraph_sampling_set_prior_sources_behavior(cugraph_sampling_options_t* options, @@ -529,13 +545,13 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_ho : NULL; } -extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_hop_offsets( +extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_hop_offsets( const cugraph_sample_result_t* result) { auto internal_pointer = reinterpret_cast(result); - return internal_pointer->hop_offsets_ != nullptr + return internal_pointer->label_hop_offsets_ != nullptr ? reinterpret_cast( - internal_pointer->hop_offsets_->view()) + internal_pointer->label_hop_offsets_->view()) : NULL; } @@ -551,17 +567,7 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_of const cugraph_sample_result_t* result) { // Deprecated. 
- return cugraph_sample_result_get_label_offsets(result); -} - -extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_offsets( - const cugraph_sample_result_t* result) -{ - auto internal_pointer = reinterpret_cast(result); - return internal_pointer->label_offsets_ != nullptr - ? reinterpret_cast( - internal_pointer->label_offsets_->view()) - : NULL; + return cugraph_sample_result_get_label_hop_offsets(result); } extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map( @@ -828,9 +834,8 @@ extern "C" void cugraph_sample_result_free(cugraph_sample_result_t* result) delete internal_pointer->edge_type_; delete internal_pointer->wgt_; delete internal_pointer->hop_; - delete internal_pointer->hop_offsets_; + delete internal_pointer->label_hop_offsets_; delete internal_pointer->label_; - delete internal_pointer->label_offsets_; delete internal_pointer->renumber_map_; delete internal_pointer->renumber_map_offsets_; delete internal_pointer; diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 219854bb002..f03aadd032e 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -67,6 +67,7 @@ def uniform_neighbor_sample( prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + use_legacy_names=True, # deprecated ) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -128,6 +129,11 @@ def uniform_neighbor_sample( Whether to renumber on a per-batch basis. If True, will return the renumber map and renumber map offsets as an additional dataframe. + + use_legacy_names: bool, optional (default=True) + Whether to use the legacy column names (sources, destinations). 
+ If True, will use "sources" and "destinations" as the column names. + If False, will use "majors" and "minors" as the column names. Returns ------- @@ -193,6 +199,18 @@ def uniform_neighbor_sample( Contains the batch offsets for the renumber maps """ + if use_legacy_names: + major_col_name = "sources" + minor_col_name = "destinations" + warning_msg = ( + "The legacy column names (sources, destinations)" + " will no longer be supported for uniform_neighbor_sample" + " in release 23.12. The use_legacy_names=False option will" + " become the only option, and (majors, minors) will be the" + " only supported column names." + ) + warnings.warn(warning_msg, FutureWarning) + if with_edge_properties: warning_msg = ( "The with_edge_properties flag is deprecated" @@ -279,35 +297,37 @@ def uniform_neighbor_sample( # TODO use a dictionary at PLC w/o breaking users if renumber: ( - sources, - destinations, + majors, + minors, weights, edge_ids, edge_types, batch_ids, - offsets, + label_hop_offsets, hop_ids, renumber_map, renumber_map_offsets, ) = sampling_result else: ( - sources, - destinations, + majors, + minors, weights, edge_ids, edge_types, batch_ids, - offsets, + label_hop_offsets, hop_ids, ) = sampling_result - df["sources"] = sources - df["destinations"] = destinations + df[major_col_name] = majors + df[minor_col_name] = minors df["weight"] = weights df["edge_id"] = edge_ids df["edge_type"] = edge_types - df["hop_id"] = hop_ids + if hop_ids is not None: + df["hop_id"] = hop_ids + if renumber: renumber_df = cudf.DataFrame( @@ -318,34 +338,57 @@ def uniform_neighbor_sample( if not return_offsets: batch_ids_r = cudf.Series(batch_ids).repeat( - cp.diff(renumber_map_offsets) + cp.diff(renumber_map_offsets[:-1]) ) batch_ids_r.reset_index(drop=True, inplace=True) renumber_df["batch_id"] = batch_ids_r if return_offsets: - offsets_df = cudf.DataFrame( - { - "batch_id": batch_ids, - "offsets": offsets[:-1], - } + batches_series = cudf.Series( + batch_ids, + name="batch_id", ) + 
offsets_df = cudf.Series( + label_hop_offsets, + name="offsets", + ).to_frame() + + if len(batches_series) > len(offsets_df): + # this is extremely rare so the inefficiency is ok + offsets_df = offsets_df.join(batches_series, how='outer').sort_index() + else: + offsets_df['batch_id'] = batches_series if renumber: - offsets_df["renumber_map_offsets"] = renumber_map_offsets[:-1] + renumber_offset_series = cudf.Series( + renumber_map_offsets[:-1], + name="renumber_map_offsets" + ) + + if len(renumber_offset_series) > len(renumber_df): + # this is extremely rare so the inefficiency is ok + renumber_df = renumber_df.join(renumber_offset_series, how='outer').sort_index() + else: + renumber_df['renumber_map_offsets'] = renumber_offset_series + else: if len(batch_ids) > 0: - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) + if renumber: # FIXME change this once Seunghwa updates the sampling API + batch_ids = cudf.Series(cp.repeat(batch_ids, len(fanout_vals))) + + batch_ids = cudf.Series(batch_ids).repeat(cp.diff(label_hop_offsets)) batch_ids.reset_index(drop=True, inplace=True) + print('output batch ids:', batch_ids) df["batch_id"] = batch_ids else: + # TODO this is deprecated, remove it in 23.12 sources, destinations, indices = sampling_result - df["sources"] = sources - df["destinations"] = destinations + df[major_col_name] = sources + df[minor_col_name] = destinations if indices is None: df["indices"] = None @@ -359,8 +402,8 @@ def uniform_neighbor_sample( df["indices"] = indices if G.renumbered and not renumber: - df = G.unrenumber(df, "sources", preserve_order=True) - df = G.unrenumber(df, "destinations", preserve_order=True) + df = G.unrenumber(df, major_col_name, preserve_order=True) + df = G.unrenumber(df, minor_col_name, preserve_order=True) if return_offsets: if renumber: diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 
62599291d04..c770326ab6c 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -362,8 +362,8 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2) if return_offsets: - assert sampling_offsets["batch_id"].values_host.tolist() == [0, 1] - assert sampling_offsets["offsets"].values_host.tolist() == [0, 6] + assert sampling_offsets["batch_id"].dropna().values_host.tolist() == [0, 1] + assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12] else: assert sampling_results["batch_id"].values_host.tolist() == ([0] * 6 + [1] * 6) @@ -778,6 +778,63 @@ def test_uniform_neighbor_sample_renumber(hops): assert (renumber_map.batch_id == 0).all() +@pytest.mark.sg +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +def test_uniform_neighbor_sample_offset_renumber(hops): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.0001 * len(el))) + + sampling_results_unrenumbered, offsets_unrenumbered = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=False, + return_offsets=True, + random_state=62, + ) + + sampling_results_renumbered, offsets_renumbered, renumber_map = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=True, + return_offsets=True, + random_state=62, + ) + + sources_hop_0 = sampling_results_unrenumbered[ + sampling_results_unrenumbered.hop_id == 0 + ].sources + for hop in range(len(hops)): + destinations_hop = sampling_results_unrenumbered[ + 
sampling_results_unrenumbered.hop_id <= hop + ].destinations + expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique() + + assert sorted(expected_renumber_map.values_host.tolist()) == sorted( + renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist() + ) + + renumber_map_offsets = renumber_map.renumber_map_offsets.dropna() + assert len(renumber_map_offsets) == 2 + assert renumber_map_offsets.iloc[0] == 0 + assert renumber_map_offsets.iloc[-1] == len(renumber_map) + + assert len(offsets_renumbered) == len(hops) + 1 + + @pytest.mark.sg @pytest.mark.skip(reason="needs to be written!") def test_multi_client_sampling(): diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index ffb458b409c..62a91b7d792 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -176,15 +176,32 @@ cdef extern from "cugraph_c/algorithms.h": const cugraph_sample_result_t* result ) + # Deprecated, use cugraph_sample_result_get_majors cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_sources( const cugraph_sample_result_t* result ) + # Deprecated, use cugraph_sample_result_get_minors cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_destinations( const cugraph_sample_result_t* result ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_majors( + const cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_minors( + const cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_host_array_view_t* \ + cugraph_sample_result_get_major_offsets( + const cugraph_sample_result_t* result + ) cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_index( @@ -211,11 +228,17 @@ cdef extern from "cugraph_c/algorithms.h": const 
cugraph_sample_result_t* result ) + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_label_hop_offsets( + const cugraph_sample_result_t* result + ) + cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_start_labels( const cugraph_sample_result_t* result ) + # Deprecated cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_offsets( const cugraph_sample_result_t* result @@ -246,10 +269,17 @@ cdef extern from "cugraph_c/algorithms.h": pass ctypedef enum cugraph_prior_sources_behavior_t: - DEFAULT + DEFAULT=0 CARRY_OVER EXCLUDE + ctypedef enum cugraph_compression_type_t: + COO=0 + CSR + CSC + DCSR + DCSC + cdef cugraph_error_code_t \ cugraph_sampling_options_create( cugraph_sampling_options_t** options, @@ -277,7 +307,7 @@ cdef extern from "cugraph_c/algorithms.h": cdef void \ cugraph_sampling_set_prior_sources_behavior( cugraph_sampling_options_t* options, - cugraph_prior_sources_behavior_t value + cugraph_prior_sources_behavior_t value, ) cdef void \ @@ -286,10 +316,22 @@ cdef extern from "cugraph_c/algorithms.h": bool_t value, ) + cdef void \ + cugraph_sampling_set_compress_per_hop( + cugraph_sampling_options_t* options, + bool_t value, + ) + + cdef void \ + cugraph_sampling_set_compression_type( + cugraph_sampling_options_t* options, + cugraph_compression_type_t value, + ) + cdef void \ cugraph_sampling_options_free( cugraph_sampling_options_t* options, - ) + ) # uniform random walks cdef cugraph_error_code_t \ diff --git a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx index d11f6994298..a233bdde69a 100644 --- a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx +++ b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx @@ -20,14 +20,17 @@ from pylibcugraph._cugraph_c.array cimport ( ) from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_t, - 
cugraph_sample_result_get_sources, - cugraph_sample_result_get_destinations, + cugraph_sample_result_get_majors, + cugraph_sample_result_get_minors, + cugraph_sample_result_get_label_hop_offsets, + cugraph_sample_result_get_sources, # deprecated + cugraph_sample_result_get_destinations, # deprecated cugraph_sample_result_get_edge_weight, cugraph_sample_result_get_edge_id, cugraph_sample_result_get_edge_type, - cugraph_sample_result_get_hop, + cugraph_sample_result_get_hop, # deprecated cugraph_sample_result_get_start_labels, - cugraph_sample_result_get_offsets, + cugraph_sample_result_get_offsets, # deprecated cugraph_sample_result_get_renumber_map, cugraph_sample_result_get_renumber_map_offsets, cugraph_sample_result_free, @@ -60,7 +63,28 @@ cdef class SamplingResult: cdef set_ptr(self, cugraph_sample_result_t* sample_result_ptr): self.c_sample_result_ptr = sample_result_ptr + def get_majors(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_majors(self.c_sample_result_ptr) + ) + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + + def get_minors(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_minors(self.c_sample_result_ptr) + ) + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + def get_sources(self): + # Deprecated if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " "non-NULL value first.") @@ -71,6 +95,7 @@ cdef class SamplingResult: self) def get_destinations(self): + # Deprecated if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " 
"non-NULL value first.") @@ -95,6 +120,7 @@ cdef class SamplingResult: self) def get_indices(self): + # Deprecated return self.get_edge_weights() def get_edge_ids(self): @@ -135,6 +161,17 @@ cdef class SamplingResult: return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) + def get_label_hop_offsets(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_label_hop_offsets(self.c_sample_result_ptr) + ) + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + + # Deprecated def get_offsets(self): if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " @@ -145,6 +182,7 @@ cdef class SamplingResult: return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) + # Deprecated def get_hop_ids(self): if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index bc2aa9205f1..b0a647cf8f5 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -38,6 +38,7 @@ from pylibcugraph._cugraph_c.graph cimport ( from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_t, cugraph_prior_sources_behavior_t, + cugraph_compression_type_t, cugraph_sampling_options_t, cugraph_sampling_options_create, cugraph_sampling_options_free, @@ -46,7 +47,8 @@ from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sampling_set_prior_sources_behavior, cugraph_sampling_set_dedupe_sources, cugraph_sampling_set_renumber_results, - + cugraph_sampling_set_compress_per_hop, + cugraph_sampling_set_compression_type, ) from 
pylibcugraph._cugraph_c.sampling_algorithms cimport ( cugraph_uniform_neighbor_sample, @@ -90,6 +92,8 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, deduplicate_sources=False, return_hops=False, renumber=False, + compression='COO', + compress_per_hop=False, random_state=None): """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -153,6 +157,16 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, If True, will renumber the sources and destinations on a per-batch basis and return the renumber map and batch offsets in additional to the standard returns. + + compression: str (Optional) + Options: COO (default), CSR, CSC, DCSR, DCSC + Sets the compression format for the returned samples. + + compress_per_hop: bool (Optional) + If False (default), will create a compressed edgelist for the + entire batch. + If True, will create a separate compressed edgelist per hop within + a batch. random_state: int (Optional) Random state to use when generating samples. Optional argument, @@ -173,13 +187,16 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, the renumber map for each batch starts). 
""" - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = ( resource_handle.c_resource_handle_ptr + ) + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr cdef bool_t c_deduplicate_sources = deduplicate_sources cdef bool_t c_return_hops = return_hops cdef bool_t c_renumber = renumber + cdef bool_t c_compress_per_hop = compress_per_hop assert_CAI_type(start_list, "start_list") assert_CAI_type(batch_id_list, "batch_id_list", True) @@ -269,6 +286,23 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, f'Invalid option {prior_sources_behavior}' ' for prior sources behavior' ) + + cdef cugraph_compression_type_t compression_behavior_e + if compression is None or compression == 'COO': + compression_behavior_e = cugraph_compression_type_t.COO + elif compression == 'CSR': + compression_behavior_e = cugraph_compression_type_t.CSR + elif compression == 'CSC': + compression_behavior_e = cugraph_compression_type_t.CSC + elif compression == 'DCSR': + compression_behavior_e = cugraph_compression_type_t.DCSR + elif compression == 'DCSC': + compression_behavior_e = cugraph_compression_type_t.DCSC + else: + raise ValueError( + f'Invalid option {compression}' + ' for compression type' + ) cdef cugraph_sampling_options_t* sampling_options error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr) @@ -279,6 +313,8 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources) cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e) cugraph_sampling_set_renumber_results(sampling_options, c_renumber) + cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e) + cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop) error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, @@ -311,24 +347,28 @@ def 
uniform_neighbor_sample(ResourceHandle resource_handle, # Get cupy "views" of the individual arrays to return. These each increment # the refcount on the SamplingResult instance which will keep the data alive # until all references are removed and the GC runs. + # TODO Return everything that isn't null in release 23.12 if with_edge_properties: - cupy_sources = result.get_sources() - cupy_destinations = result.get_destinations() + cupy_majors = result.get_majors() + cupy_minors = result.get_minors() cupy_edge_weights = result.get_edge_weights() cupy_edge_ids = result.get_edge_ids() cupy_edge_types = result.get_edge_types() cupy_batch_ids = result.get_batch_ids() - cupy_offsets = result.get_offsets() - cupy_hop_ids = result.get_hop_ids() + cupy_label_hop_offsets = result.get_label_hop_offsets() + if renumber: cupy_renumber_map = result.get_renumber_map() cupy_renumber_map_offsets = result.get_renumber_map_offsets() - return (cupy_sources, cupy_destinations, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_offsets, cupy_hop_ids, cupy_renumber_map, cupy_renumber_map_offsets) + # TODO drop the placeholder for hop ids in release 23.12 + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, None, cupy_renumber_map, cupy_renumber_map_offsets) else: - return (cupy_sources, cupy_destinations, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_offsets, cupy_hop_ids) + cupy_hop_ids = result.get_hop_ids() # FIXME change this once Seunghwa updates the API + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, cupy_hop_ids) else: + # TODO this is deprecated, remove it in release 23.12 cupy_sources = result.get_sources() cupy_destinations = result.get_destinations() cupy_indices = result.get_indices()