Skip to content

Commit

Permalink
various improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
alexbarghi-nv committed Sep 12, 2023
1 parent 0a2b2b7 commit c86ceac
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 63 deletions.
124 changes: 66 additions & 58 deletions python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ def uniform_neighbor_sample(
deduplicate_sources: bool = False,
renumber: bool = False,
use_legacy_names=True, # deprecated
compress_per_hop=False,
compression='COO',
) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]:
"""
Does neighborhood sampling, which samples nodes from a graph based on the
Expand Down Expand Up @@ -134,6 +136,14 @@ def uniform_neighbor_sample(
Whether to use the legacy column names (sources, destinations).
If True, will use "sources" and "destinations" as the column names.
If False, will use "majors" and "minors" as the column names.
compress_per_hop: bool, optional (default=False)
Whether to compress globally (default), or to produce a separate
compressed edgelist per hop.
compression: str, optional (default=COO)
Sets the compression type for the output minibatches.
Valid options are COO (default), CSR, CSC, DCSR, and DCSC.
Returns
-------
Expand Down Expand Up @@ -210,6 +220,9 @@ def uniform_neighbor_sample(
" only supported column names."
)
warnings.warn(warning_msg, FutureWarning)
else:
major_col_name = "majors"
minor_col_name = "minors"

if with_edge_properties:
warning_msg = (
Expand Down Expand Up @@ -289,56 +302,41 @@ def uniform_neighbor_sample(
deduplicate_sources=deduplicate_sources,
return_hops=return_hops,
renumber=renumber,
compression=compression,
compress_per_hop=compress_per_hop,
return_dict=True,
)

df = cudf.DataFrame()
results_df = cudf.DataFrame()

if with_edge_properties:
# TODO use a dictionary at PLC w/o breaking users
if renumber:
(
majors,
minors,
weights,
edge_ids,
edge_types,
batch_ids,
label_hop_offsets,
hop_ids,
renumber_map,
renumber_map_offsets,
) = sampling_result
else:
(
majors,
minors,
weights,
edge_ids,
edge_types,
batch_ids,
label_hop_offsets,
hop_ids,
) = sampling_result

df[major_col_name] = majors
df[minor_col_name] = minors
df["weight"] = weights
df["edge_id"] = edge_ids
df["edge_type"] = edge_types
if hop_ids is not None:
df["hop_id"] = hop_ids

results_df_cols = [
'majors',
'minors',
'weight',
'edge_id',
'edge_type',
'hop_id'
]
for col in results_df_cols:
array = sampling_result[col]
if array is not None:
# The length of each of these arrays should be the same
results_df[col] = array

results_df.rename(columns={'majors':major_col_name, 'minors':minor_col_name},inplace=True)

label_hop_offsets = sampling_result['label_hop_offsets']
batch_ids = sampling_result['batch_id']

if renumber:
renumber_df = cudf.DataFrame(
{
"map": renumber_map,
}
)
renumber_df = cudf.DataFrame({
'map': sampling_result['renumber_map'],
})

if not return_offsets:
batch_ids_r = cudf.Series(batch_ids).repeat(
cp.diff(renumber_map_offsets[:-1])
cp.diff(sampling_result['renumber_map_offsets'][:-1])
)
batch_ids_r.reset_index(drop=True, inplace=True)
renumber_df["batch_id"] = batch_ids_r
Expand All @@ -361,7 +359,7 @@ def uniform_neighbor_sample(

if renumber:
renumber_offset_series = cudf.Series(
renumber_map_offsets[:-1],
sampling_result['renumber_map_offsets'][:-1],
name="renumber_map_offsets"
)

Expand All @@ -370,7 +368,6 @@ def uniform_neighbor_sample(
renumber_df = renumber_df.join(renumber_offset_series, how='outer').sort_index()
else:
renumber_df['renumber_map_offsets'] = renumber_offset_series


else:
if len(batch_ids) > 0:
Expand All @@ -381,37 +378,48 @@ def uniform_neighbor_sample(
batch_ids.reset_index(drop=True, inplace=True)
print('output batch ids:', batch_ids)

df["batch_id"] = batch_ids
results_df["batch_id"] = batch_ids

if major_col_name not in results_df:
if use_legacy_names:
raise ValueError("Can't use legacy names with major offsets")

major_offsets_series = cudf.Series(sampling_result['major_offsets'], name='major_offsets')
if len(major_offsets_series) > len(results_df):
# this is extremely rare so the inefficiency is ok
results_df = results_df.join(major_offsets_series, how='outer').sort_index()
else:
results_df['major_offsets'] = major_offsets_series

else:
# TODO this is deprecated, remove it in 23.12
sources, destinations, indices = sampling_result

df[major_col_name] = sources
df[minor_col_name] = destinations
results_df[major_col_name] = sampling_result['sources']
results_df[minor_col_name] = sampling_result['destinations']
indices = sampling_result['indices']

if indices is None:
df["indices"] = None
results_df["indices"] = None
else:
df["indices"] = indices
results_df["indices"] = indices
if weight_t == "int32":
df["indices"] = indices.astype("int32")
results_df["indices"] = indices.astype("int32")
elif weight_t == "int64":
df["indices"] = indices.astype("int64")
results_df["indices"] = indices.astype("int64")
else:
df["indices"] = indices
results_df["indices"] = indices

if G.renumbered and not renumber:
df = G.unrenumber(df, major_col_name, preserve_order=True)
df = G.unrenumber(df, minor_col_name, preserve_order=True)
results_df = G.unrenumber(results_df, major_col_name, preserve_order=True)
results_df = G.unrenumber(results_df, minor_col_name, preserve_order=True)

if return_offsets:
if renumber:
return df, offsets_df, renumber_df
return results_df, offsets_df, renumber_df
else:
return df, offsets_df
return results_df, offsets_df

if renumber:
return df, renumber_df
return results_df, renumber_df

return df
return results_df
2 changes: 1 addition & 1 deletion python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ cdef extern from "cugraph_c/algorithms.h":
const cugraph_sample_result_t* result
)

cdef cugraph_type_erased_host_array_view_t* \
cdef cugraph_type_erased_device_array_view_t* \
cugraph_sample_result_get_major_offsets(
const cugraph_sample_result_t* result
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ from pylibcugraph._cugraph_c.array cimport (
)
from pylibcugraph._cugraph_c.algorithms cimport (
cugraph_sample_result_t,
cugraph_sample_result_get_major_offsets,
cugraph_sample_result_get_majors,
cugraph_sample_result_get_minors,
cugraph_sample_result_get_label_hop_offsets,
Expand Down Expand Up @@ -63,13 +64,30 @@ cdef class SamplingResult:
cdef set_ptr(self, cugraph_sample_result_t* sample_result_ptr):
self.c_sample_result_ptr = sample_result_ptr

# Return the major offsets held by the wrapped sampling result, wrapped by
# create_cupy_array_view_for_device_ptr, or None when the C API returns a
# NULL array view.
# Raises ValueError when no non-NULL result pointer has been set via set_ptr().
def get_major_offsets(self):
if self.c_sample_result_ptr is NULL:
raise ValueError("pointer not set, must call set_ptr() with a "
"non-NULL value first.")

cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_major_offsets(self.c_sample_result_ptr)
)
# A NULL view is reported to the caller as None instead of being wrapped.
if device_array_view_ptr is NULL:
return None

# NOTE(review): self is passed alongside the view pointer, presumably so the
# returned array keeps this result object alive - confirm against the helper.
return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

# Return the majors array held by the wrapped sampling result, wrapped by
# create_cupy_array_view_for_device_ptr, or None when the C API returns a
# NULL array view.
# Raises ValueError when no non-NULL result pointer has been set via set_ptr().
def get_majors(self):
if self.c_sample_result_ptr is NULL:
raise ValueError("pointer not set, must call set_ptr() with a "
"non-NULL value first.")
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_majors(self.c_sample_result_ptr)
)
# A NULL view is reported to the caller as None instead of being wrapped.
if device_array_view_ptr is NULL:
return None

# NOTE(review): self is passed alongside the view pointer, presumably so the
# returned array keeps this result object alive - confirm against the helper.
return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -80,6 +98,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_minors(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -91,6 +112,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_sources(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -102,6 +126,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_destinations(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand Down Expand Up @@ -158,6 +185,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_start_labels(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -168,6 +198,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_label_hop_offsets(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -179,6 +212,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_offsets(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -190,6 +226,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_hop(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -200,6 +239,9 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_renumber_map(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)

Expand All @@ -210,5 +252,8 @@ cdef class SamplingResult:
cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
cugraph_sample_result_get_renumber_map_offsets(self.c_sample_result_ptr)
)
if device_array_view_ptr is NULL:
return None

return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
self)
Loading

0 comments on commit c86ceac

Please sign in to comment.