diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index e2b8c33519a..65abb5e96ea 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -28,7 +28,6 @@ #include #include -#include namespace cugraph { namespace c_api { @@ -219,19 +218,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct options_.dedupe_sources_, do_expensive_check_); - std::cout << "has labels? " << has_labels << std::endl; - std::cout << "has offsets? " << (offsets.has_value()) << std::endl; - - bool print=false; - if (offsets->size() < 10) { - print=true; - for(size_t k = 0; k < offsets->size(); ++k) std::cout << offsets->element(k, handle_.get_stream()) << " "; - std::cout << std::endl; - - for(size_t k = 0; k < hop->size(); ++k) std::cout << hop->element(k, handle_.get_stream()) << " "; - std::cout << std::endl; - } - std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); cugraph::unrenumber_int_vertices(handle_, @@ -340,8 +326,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct CUGRAPH_FAIL("Can only use COO format if not renumbering"); } - std::cout << "offsets? " << offsets.has_value() << std::endl; - std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) = cugraph::sort_sampled_edgelist( handle_, @@ -365,12 +349,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct hop.reset(); offsets.reset(); - - if(print && label_hop_offsets) { - std::cout << "printing label_hop_offsets: "; - for(size_t k = 0; k < label_hop_offsets->size(); ++k) std::cout << label_hop_offsets->element(k, handle_.get_stream()); - std::cout << std::endl; - } } result_ = new cugraph::c_api::cugraph_sample_result_t{ diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 0c397d91b20..e8fecf47414 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -166,9 +166,12 @@ void check_input_edges( std::numeric_limits::max()), "Invalid input arguments: current implementation assumes that the number of " "unique labels is no larger than std::numeric_limits::max()."); + /* CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0, "Invlaid input arguments: there should be 1 or more labels if " "edgelist_label_offsets.has_value() is true."); + */ + CUGRAPH_EXPECTS( !edgelist_label_offsets.has_value() || (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1), diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index ec708925428..079c55a4a6a 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -294,6 +294,7 @@ def uniform_neighbor_sample( start_list = G.lookup_internal_vertex_id(start_list, columns) start_list = start_list.rename(columns={columns[0]: start_col_name}) + sampling_result = pylibcugraph_uniform_neighbor_sample( resource_handle=ResourceHandle(), input_graph=G._plc_graph, @@ -343,21 +344,33 @@ def uniform_neighbor_sample( }) if not return_offsets: - batch_ids_r = cudf.Series(batch_ids).repeat( - cp.diff(sampling_result['renumber_map_offsets'][:-1]) - ) - batch_ids_r.reset_index(drop=True, inplace=True) - renumber_df["batch_id"] = batch_ids_r + if len(batch_ids) > 0: + print(batch_ids) + print(sampling_result['renumber_map_offsets']) + batch_ids_r = cudf.Series(batch_ids).repeat( + cp.diff(sampling_result['renumber_map_offsets']) + ) + batch_ids_r.reset_index(drop=True, inplace=True) + renumber_df["batch_id"] = batch_ids_r + else: + renumber_df['batch_id'] = None if return_offsets: batches_series = cudf.Series( batch_ids, name="batch_id", ) - offsets_df = cudf.Series( - label_hop_offsets, - name="offsets", - ).to_frame() + if include_hop_column: + # TODO remove this logic in release 23.12 + offsets_df = cudf.Series( + label_hop_offsets[cp.arange(len(batch_ids)+1) * len(fanout_vals)], + name='offsets', + ).to_frame() + else: + offsets_df = cudf.Series( + label_hop_offsets, + name="offsets", + ).to_frame() if len(batches_series) > len(offsets_df): # this is extremely rare so the inefficiency is ok @@ -376,23 +389,34 @@ def uniform_neighbor_sample( renumber_df = renumber_df.join(renumber_offset_series, how='outer').sort_index() else: renumber_df['renumber_map_offsets'] = renumber_offset_series - - if include_hop_column: - print(batch_ids) - print(label_hop_offsets) - raise ValueError("asdf") else: if len(batch_ids) > 0: - if renumber: # FIXME change this once Seunghwa updates the sampling API - batch_ids = cudf.Series(cp.repeat(batch_ids, len(fanout_vals))) - - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(label_hop_offsets)) - batch_ids.reset_index(drop=True, inplace=True) - print('output batch ids:', batch_ids) - - results_df["batch_id"] = batch_ids + batch_ids_r = cudf.Series(cp.repeat(batch_ids, len(fanout_vals))) + batch_ids_r = cudf.Series(batch_ids_r).repeat(cp.diff(label_hop_offsets)) + batch_ids_r.reset_index(drop=True, inplace=True) + + results_df["batch_id"] = batch_ids_r + else: + results_df['batch_id'] = None + # TODO remove this logic in release 23.12, hops will always returned as offsets + if include_hop_column: + if len(batch_ids) > 0: + hop_ids_r = cudf.Series(cp.arange(len(fanout_vals))) + hop_ids_r = cudf.concat([hop_ids_r] * len(batch_ids),ignore_index=True) + print(len(hop_ids_r)) + print(len(label_hop_offsets)) + + # generate the hop column + hop_ids_r = cudf.Series(hop_ids_r, name='hop_id').repeat( + cp.diff(label_hop_offsets) + ).reset_index(drop=True) + else: + hop_ids_r = cudf.Series(name='hop_id', dtype='int32') + + results_df = results_df.join(hop_ids_r, how='outer').sort_index() + if major_col_name not in results_df: if use_legacy_names: raise ValueError("Can't use legacy names with major offsets") diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index c0cb18dcf29..1fb6ad419fa 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -308,7 +308,8 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out @pytest.mark.sg @pytest.mark.cugraph_ops @pytest.mark.parametrize("return_offsets", [True, False]) -def test_uniform_neighbor_sample_edge_properties(return_offsets): +@pytest.mark.parametrize("include_hop_column", [True, False]) +def test_uniform_neighbor_sample_edge_properties(return_offsets, include_hop_column): edgelist_df = cudf.DataFrame( { "src": cudf.Series([0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2], dtype="int32"), @@ -342,6 +343,7 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): with_edge_properties=True, with_batch_ids=True, return_offsets=return_offsets, + include_hop_column=include_hop_column ) if return_offsets: sampling_results, sampling_offsets = sampling_results @@ -364,11 +366,17 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): == sampling_results["destinations"].values_host.tolist() ) - assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2) + if include_hop_column: + assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2) + else: + assert 'hop_id' not in sampling_results if return_offsets: assert sampling_offsets["batch_id"].dropna().values_host.tolist() == [0, 1] - assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12] + if include_hop_column: + assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12] + else: + assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 2, 6, 8, 12] else: assert sampling_results["batch_id"].values_host.tolist() == ([0] * 6 + [1] * 6)