Skip to content

Commit

Permalink
wrap up sg API
Browse files Browse the repository at this point in the history
  • Loading branch information
alexbarghi-nv committed Sep 20, 2023
1 parent 2bd93d9 commit 3195298
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 47 deletions.
22 changes: 0 additions & 22 deletions cpp/src/c_api/uniform_neighbor_sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <cugraph/sampling_functions.hpp>

#include <raft/core/handle.hpp>
#include <iostream>

namespace cugraph {
namespace c_api {
Expand Down Expand Up @@ -219,19 +218,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
options_.dedupe_sources_,
do_expensive_check_);

std::cout << "has labels? " << has_labels << std::endl;
std::cout << "has offsets? " << (offsets.has_value()) << std::endl;

bool print=false;
if (offsets->size() < 10) {
print=true;
for(size_t k = 0; k < offsets->size(); ++k) std::cout << offsets->element(k, handle_.get_stream()) << " ";
std::cout << std::endl;

for(size_t k = 0; k < hop->size(); ++k) std::cout << hop->element(k, handle_.get_stream()) << " ";
std::cout << std::endl;
}

std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();

cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
Expand Down Expand Up @@ -340,8 +326,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
CUGRAPH_FAIL("Can only use COO format if not renumbering");
}

std::cout << "offsets? " << offsets.has_value() << std::endl;

std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) =
cugraph::sort_sampled_edgelist(
handle_,
Expand All @@ -365,12 +349,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct

hop.reset();
offsets.reset();

if(print && label_hop_offsets) {
std::cout << "printing label_hop_offsets: ";
for(size_t k = 0; k < label_hop_offsets->size(); ++k) std::cout << label_hop_offsets->element(k, handle_.get_stream());
std::cout << std::endl;
}
}

result_ = new cugraph::c_api::cugraph_sample_result_t{
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/sampling/sampling_post_processing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,12 @@ void check_input_edges(
std::numeric_limits<label_index_t>::max()),
"Invalid input arguments: current implementation assumes that the number of "
"unique labels is no larger than std::numeric_limits<uint32_t>::max().");
/*
CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0,
"Invalid input arguments: there should be 1 or more labels if "
"edgelist_label_offsets.has_value() is true.");
*/

CUGRAPH_EXPECTS(
!edgelist_label_offsets.has_value() ||
(std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1),
Expand Down
68 changes: 46 additions & 22 deletions python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ def uniform_neighbor_sample(
start_list = G.lookup_internal_vertex_id(start_list, columns)
start_list = start_list.rename(columns={columns[0]: start_col_name})


sampling_result = pylibcugraph_uniform_neighbor_sample(
resource_handle=ResourceHandle(),
input_graph=G._plc_graph,
Expand Down Expand Up @@ -343,21 +344,33 @@ def uniform_neighbor_sample(
})

if not return_offsets:
batch_ids_r = cudf.Series(batch_ids).repeat(
cp.diff(sampling_result['renumber_map_offsets'][:-1])
)
batch_ids_r.reset_index(drop=True, inplace=True)
renumber_df["batch_id"] = batch_ids_r
if len(batch_ids) > 0:
print(batch_ids)
print(sampling_result['renumber_map_offsets'])
batch_ids_r = cudf.Series(batch_ids).repeat(
cp.diff(sampling_result['renumber_map_offsets'])
)
batch_ids_r.reset_index(drop=True, inplace=True)
renumber_df["batch_id"] = batch_ids_r
else:
renumber_df['batch_id'] = None

if return_offsets:
batches_series = cudf.Series(
batch_ids,
name="batch_id",
)
offsets_df = cudf.Series(
label_hop_offsets,
name="offsets",
).to_frame()
if include_hop_column:
# TODO remove this logic in release 23.12
offsets_df = cudf.Series(
label_hop_offsets[cp.arange(len(batch_ids)+1) * len(fanout_vals)],
name='offsets',
).to_frame()
else:
offsets_df = cudf.Series(
label_hop_offsets,
name="offsets",
).to_frame()

if len(batches_series) > len(offsets_df):
# this is extremely rare so the inefficiency is ok
Expand All @@ -376,23 +389,34 @@ def uniform_neighbor_sample(
renumber_df = renumber_df.join(renumber_offset_series, how='outer').sort_index()
else:
renumber_df['renumber_map_offsets'] = renumber_offset_series

if include_hop_column:
print(batch_ids)
print(label_hop_offsets)
raise ValueError("asdf")

else:
if len(batch_ids) > 0:
if renumber: # FIXME change this once Seunghwa updates the sampling API
batch_ids = cudf.Series(cp.repeat(batch_ids, len(fanout_vals)))

batch_ids = cudf.Series(batch_ids).repeat(cp.diff(label_hop_offsets))
batch_ids.reset_index(drop=True, inplace=True)
print('output batch ids:', batch_ids)

results_df["batch_id"] = batch_ids
batch_ids_r = cudf.Series(cp.repeat(batch_ids, len(fanout_vals)))
batch_ids_r = cudf.Series(batch_ids_r).repeat(cp.diff(label_hop_offsets))
batch_ids_r.reset_index(drop=True, inplace=True)

results_df["batch_id"] = batch_ids_r
else:
results_df['batch_id'] = None

# TODO remove this logic in release 23.12; hops will always be returned as offsets
if include_hop_column:
if len(batch_ids) > 0:
hop_ids_r = cudf.Series(cp.arange(len(fanout_vals)))
hop_ids_r = cudf.concat([hop_ids_r] * len(batch_ids),ignore_index=True)
print(len(hop_ids_r))
print(len(label_hop_offsets))

# generate the hop column
hop_ids_r = cudf.Series(hop_ids_r, name='hop_id').repeat(
cp.diff(label_hop_offsets)
).reset_index(drop=True)
else:
hop_ids_r = cudf.Series(name='hop_id', dtype='int32')

results_df = results_df.join(hop_ids_r, how='outer').sort_index()

if major_col_name not in results_df:
if use_legacy_names:
raise ValueError("Can't use legacy names with major offsets")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,8 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out
@pytest.mark.sg
@pytest.mark.cugraph_ops
@pytest.mark.parametrize("return_offsets", [True, False])
def test_uniform_neighbor_sample_edge_properties(return_offsets):
@pytest.mark.parametrize("include_hop_column", [True, False])
def test_uniform_neighbor_sample_edge_properties(return_offsets, include_hop_column):
edgelist_df = cudf.DataFrame(
{
"src": cudf.Series([0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2], dtype="int32"),
Expand Down Expand Up @@ -342,6 +343,7 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets):
with_edge_properties=True,
with_batch_ids=True,
return_offsets=return_offsets,
include_hop_column=include_hop_column
)
if return_offsets:
sampling_results, sampling_offsets = sampling_results
Expand All @@ -364,11 +366,17 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets):
== sampling_results["destinations"].values_host.tolist()
)

assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2)
if include_hop_column:
assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2)
else:
assert 'hop_id' not in sampling_results

if return_offsets:
assert sampling_offsets["batch_id"].dropna().values_host.tolist() == [0, 1]
assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12]
if include_hop_column:
assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 6, 12]
else:
assert sampling_offsets["offsets"].dropna().values_host.tolist() == [0, 2, 6, 8, 12]
else:
assert sampling_results["batch_id"].values_host.tolist() == ([0] * 6 + [1] * 6)

Expand Down

0 comments on commit 3195298

Please sign in to comment.