From d5230faad533bb78bea0cf583682ba723eee2960 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 29 Dec 2023 16:04:44 -0800 Subject: [PATCH] remove flag --- .../cugraph_pyg/sampler/cugraph_sampler.py | 1 - .../dask/sampling/uniform_neighbor_sample.py | 172 +++++++----------- .../sampling/uniform_neighbor_sample.py | 150 ++++++--------- .../sampling/test_uniform_neighbor_sample.py | 3 - .../test_uniform_neighbor_sample_mg.py | 5 - 5 files changed, 124 insertions(+), 207 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index b6ec932abbe..4750e0cbc07 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -199,7 +199,6 @@ def __neighbor_sample( # conversion required by cugraph api list(num_neighbors), replace, - with_edge_properties=True, ) if self.__graph_store._is_delayed: diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 15d109452eb..fecc34f799e 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -94,58 +94,44 @@ def create_empty_df_with_edge_props(indices_t, weight_t, return_offsets=False): return df -def convert_to_cudf(cp_arrays, weight_t, with_edge_properties, return_offsets=False): +def convert_to_cudf(cp_arrays, return_offsets=False): """ Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper """ df = cudf.DataFrame() - if with_edge_properties: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - ) = cp_arrays - - df[src_n] = sources - df[dst_n] = destinations - df[weight_n] = weights - df[edge_id_n] = edge_ids - df[edge_type_n] = edge_types - df[hop_id_n] = hop_ids - - if return_offsets: - offsets_df = cudf.DataFrame( - { - batch_id_n: batch_ids, - offsets_n: offsets[:-1], - } - ) - return df, offsets_df - else: - if len(batch_ids) > 0: - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) - batch_ids.reset_index(drop=True, inplace=True) + ( + sources, + destinations, + weights, + edge_ids, + edge_types, + batch_ids, + offsets, + hop_ids, + ) = cp_arrays + + df[src_n] = sources + df[dst_n] = destinations + df[weight_n] = weights + df[edge_id_n] = edge_ids + df[edge_type_n] = edge_types + df[hop_id_n] = hop_ids - df[batch_id_n] = batch_ids - return df + if return_offsets: + offsets_df = cudf.DataFrame( + { + batch_id_n: batch_ids, + offsets_n: offsets[:-1], + } + ) + return df, offsets_df else: - cupy_sources, cupy_destinations, cupy_indices = cp_arrays - - df[src_n] = cupy_sources - df[dst_n] = cupy_destinations - df[indices_n] = cupy_indices - - if weight_t == "int32": - df.indices = df.indices.astype("int32") - elif weight_t == "int64": - df.indices = df.indices.astype("int64") + if len(batch_ids) > 0: + batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) + batch_ids.reset_index(drop=True, inplace=True) + df[batch_id_n] = batch_ids return df @@ -157,8 +143,6 @@ def _call_plc_uniform_neighbor_sample( label_to_output_comm_rank, fanout_vals, with_replacement, - weight_t, - with_edge_properties, random_state=None, return_offsets=False, ): @@ -173,12 +157,11 @@ def _call_plc_uniform_neighbor_sample( h_fan_out=fanout_vals, with_replacement=with_replacement, do_expensive_check=False, - with_edge_properties=with_edge_properties, 
batch_id_list=batch_id_list_x, random_state=random_state, ) return convert_to_cudf( - cp_arrays, weight_t, with_edge_properties, return_offsets=return_offsets + cp_arrays, return_offsets=return_offsets ) @@ -193,7 +176,6 @@ def _mg_call_plc_uniform_neighbor_sample( with_replacement, weight_t, indices_t, - with_edge_properties, random_state, return_offsets=False, ): @@ -207,8 +189,6 @@ def _mg_call_plc_uniform_neighbor_sample( label_to_output_comm_rank, fanout_vals, with_replacement, - weight_t=weight_t, - with_edge_properties=with_edge_properties, # FIXME accept and properly transmute a numpy/cupy random state. random_state=hash((random_state, i)), workers=[w], @@ -223,8 +203,6 @@ def _mg_call_plc_uniform_neighbor_sample( create_empty_df_with_edge_props( indices_t, weight_t, return_offsets=return_offsets ) - if with_edge_properties - else create_empty_df(indices_t, weight_t) ) if return_offsets: @@ -251,7 +229,6 @@ def uniform_neighbor_sample( start_list: Sequence, fanout_vals: List[int], with_replacement: bool = True, - with_edge_properties: bool = False, batch_id_list: Sequence = None, label_list: Sequence = None, label_to_output_comm_rank: bool = None, @@ -279,13 +256,8 @@ def uniform_neighbor_sample( with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement - with_edge_properties: bool, optional (default=False) - Flag to specify whether to return edge properties (weight, edge id, - edge type, batch id, hop id) with the sampled edges. - batch_id_list: cudf.Series or dask_cudf.Series (int32), optional (default=None) - List of batch ids that will be returned with the sampled edges if - with_edge_properties is set to True. + List of batch ids that will be returned with the sampled edges. label_list: cudf.Series or dask_cudf.Series (int32), optional (default=None) List of unique batch id labels. 
Used along with @@ -314,51 +286,40 @@ def uniform_neighbor_sample( ------- result : dask_cudf.DataFrame or Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame] GPU distributed data frame containing several dask_cudf.Series - - If with_edge_properties=True: - ddf['sources']: dask_cudf.Series + If return_offsets=False: + df['sources']: dask_cudf.Series Contains the source vertices from the sampling result - ddf['destinations']: dask_cudf.Series + df['destinations']: dask_cudf.Series Contains the destination vertices from the sampling result - ddf['indices']: dask_cudf.Series - Contains the indices from the sampling result for path - reconstruction - - If with_edge_properties=False: - If return_offsets=False: - df['sources']: dask_cudf.Series - Contains the source vertices from the sampling result - df['destinations']: dask_cudf.Series - Contains the destination vertices from the sampling result - df['edge_weight']: dask_cudf.Series - Contains the edge weights from the sampling result - df['edge_id']: dask_cudf.Series - Contains the edge ids from the sampling result - df['edge_type']: dask_cudf.Series - Contains the edge types from the sampling result - df['batch_id']: dask_cudf.Series - Contains the batch ids from the sampling result - df['hop_id']: dask_cudf.Series - Contains the hop ids from the sampling result - - If return_offsets=True: - df['sources']: cudf.Series - Contains the source vertices from the sampling result - df['destinations']: cudf.Series - Contains the destination vertices from the sampling result - df['edge_weight']: cudf.Series - Contains the edge weights from the sampling result - df['edge_id']: cudf.Series - Contains the edge ids from the sampling result - df['edge_type']: cudf.Series - Contains the edge types from the sampling result - df['hop_id']: cudf.Series - Contains the hop ids from the sampling result - - offsets_df['batch_id']: cudf.Series - Contains the batch ids from the sampling result - offsets_df['offsets']: cudf.Series - Contains the offsets of each batch in the sampling result + df['edge_weight']: dask_cudf.Series + Contains the edge weights from the sampling result + df['edge_id']: dask_cudf.Series + Contains the edge ids from the sampling result + df['edge_type']: dask_cudf.Series + Contains the edge types from the sampling result + df['batch_id']: dask_cudf.Series + Contains the batch ids from the sampling result + df['hop_id']: dask_cudf.Series + Contains the hop ids from the sampling result + + If return_offsets=True: + df['sources']: cudf.Series + Contains the source vertices from the sampling result + df['destinations']: cudf.Series + Contains the destination vertices from the sampling result + df['edge_weight']: cudf.Series + Contains the edge weights from the sampling result + df['edge_id']: cudf.Series + Contains the edge ids from the sampling result + df['edge_type']: cudf.Series + Contains the edge types from the sampling result + df['hop_id']: cudf.Series + Contains the hop ids from the sampling result + + offsets_df['batch_id']: cudf.Series + Contains the batch ids from the sampling result + offsets_df['offsets']: cudf.Series + Contains the offsets of each batch in the sampling result """ if isinstance(start_list, int): @@ -371,8 +332,7 @@ def uniform_neighbor_sample( input_graph.renumber_map.renumbered_src_col_name ].dtype, ) - - elif with_edge_properties and batch_id_list is None: + elif batch_id_list is None: batch_id_list = cudf.Series(cp.zeros(len(start_list), dtype="int32")) # fanout_vals must be a host array! 
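# Usage sketch (illustrative only, not part of the diff): how the MG sampler is
# called once with_edge_properties is removed. Edge properties (weight, edge id,
# edge type, batch id, hop id) are now always returned. The toy edge list, the
# column names "src"/"dst"/"wgt", and the single-node dask-CUDA cluster setup
# below are assumptions made for this sketch, not something the patch prescribes.
import cudf
import dask_cudf
import cugraph
import cugraph.dask as dcg
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from cugraph.dask.comms import comms as Comms

cluster = LocalCUDACluster()
client = Client(cluster)
Comms.initialize(p2p=True)

# Tiny directed ring graph, distributed as a single dask_cudf partition.
edges = dask_cudf.from_cudf(
    cudf.DataFrame({"src": [0, 1, 2, 3], "dst": [1, 2, 3, 0], "wgt": [1.0] * 4}),
    npartitions=1,
)
G = cugraph.Graph(directed=True)
G.from_dask_cudf_edgelist(edges, source="src", destination="dst", edge_attr="wgt")

# No with_edge_properties flag anymore; batch_id_list stays optional and, per the
# patched code above, defaults to a single batch of zeros when omitted.
result = dcg.uniform_neighbor_sample(
    G,
    start_list=cudf.Series([0, 1], dtype="int64"),
    fanout_vals=[2, 2],
    with_replacement=False,
    batch_id_list=cudf.Series([0, 1], dtype="int32"),
)
# The result is a dask_cudf DataFrame with sources, destinations, and the edge
# property / batch / hop columns described in the docstring above.
print(result.compute())

Comms.destroy()
client.close()
cluster.close()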
@@ -450,7 +410,6 @@ def uniform_neighbor_sample( with_replacement=with_replacement, weight_t=weight_t, indices_t=indices_t, - with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, ) @@ -472,7 +431,6 @@ def uniform_neighbor_sample( with_replacement=with_replacement, weight_t=weight_t, indices_t=indices_t, - with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, ) diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index a7dad6c01a6..79fed3b099c 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -55,7 +55,6 @@ def uniform_neighbor_sample( start_list: Sequence, fanout_vals: List[int], with_replacement: bool = True, - with_edge_properties: bool = False, batch_id_list: Sequence = None, random_state: int = None, return_offsets: bool = False, @@ -80,10 +79,6 @@ def uniform_neighbor_sample( with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement - with_edge_properties: bool, optional (default=False) - Flag to specify whether to return edge properties (weight, edge id, - edge type, batch id, hop id) with the sampled edges. - batch_id_list: list (int32) List of batch ids that will be returned with the sampled edges if with_edge_properties is set to True. @@ -101,51 +96,40 @@ def uniform_neighbor_sample( ------- result : cudf.DataFrame or Tuple[cudf.DataFrame, cudf.DataFrame] GPU data frame containing multiple cudf.Series - - If with_edge_properties=False: + If return_offsets=False: + df['sources']: cudf.Series + Contains the source vertices from the sampling result + df['destinations']: cudf.Series + Contains the destination vertices from the sampling result + df['edge_weight']: cudf.Series + Contains the edge weights from the sampling result + df['edge_id']: cudf.Series + Contains the edge ids from the sampling result + df['edge_type']: cudf.Series + Contains the edge types from the sampling result + df['batch_id']: cudf.Series + Contains the batch ids from the sampling result + df['hop_id']: cudf.Series + Contains the hop ids from the sampling result + + If return_offsets=True: df['sources']: cudf.Series Contains the source vertices from the sampling result df['destinations']: cudf.Series Contains the destination vertices from the sampling result - df['indices']: cudf.Series - Contains the indices (edge weights) from the sampling result - for path reconstruction - - If with_edge_properties=True: - If return_offsets=False: - df['sources']: cudf.Series - Contains the source vertices from the sampling result - df['destinations']: cudf.Series - Contains the destination vertices from the sampling result - df['edge_weight']: cudf.Series - Contains the edge weights from the sampling result - df['edge_id']: cudf.Series - Contains the edge ids from the sampling result - df['edge_type']: cudf.Series - Contains the edge types from the sampling result - df['batch_id']: cudf.Series - Contains the batch ids from the sampling result - df['hop_id']: cudf.Series - Contains the hop ids from the sampling result - - If return_offsets=True: - df['sources']: cudf.Series - Contains the source vertices from the sampling result - df['destinations']: cudf.Series - Contains the destination vertices from the sampling result - df['edge_weight']: cudf.Series - Contains the edge weights from the sampling result - 
df['edge_id']: cudf.Series - Contains the edge ids from the sampling result - df['edge_type']: cudf.Series - Contains the edge types from the sampling result - df['hop_id']: cudf.Series - Contains the hop ids from the sampling result - - offsets_df['batch_id']: cudf.Series - Contains the batch ids from the sampling result - offsets_df['offsets']: cudf.Series - Contains the offsets of each batch in the sampling result + df['edge_weight']: cudf.Series + Contains the edge weights from the sampling result + df['edge_id']: cudf.Series + Contains the edge ids from the sampling result + df['edge_type']: cudf.Series + Contains the edge types from the sampling result + df['hop_id']: cudf.Series + Contains the hop ids from the sampling result + + offsets_df['batch_id']: cudf.Series + Contains the batch ids from the sampling result + offsets_df['offsets']: cudf.Series + Contains the offsets of each batch in the sampling result """ if isinstance(start_list, int): @@ -156,7 +140,7 @@ def uniform_neighbor_sample( start_list, dtype=G.edgelist.edgelist_df[G.srcCol].dtype ) - if with_edge_properties and batch_id_list is None: + if batch_id_list is None: batch_id_list = cp.zeros(len(start_list), dtype="int32") # fanout_vals must be a host array! @@ -186,60 +170,44 @@ def uniform_neighbor_sample( h_fan_out=fanout_vals, with_replacement=with_replacement, do_expensive_check=False, - with_edge_properties=with_edge_properties, batch_id_list=batch_id_list, random_state=random_state, ) df = cudf.DataFrame() - if with_edge_properties: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - ) = sampling_result - - df["sources"] = sources - df["destinations"] = destinations - df["weight"] = weights - df["edge_id"] = edge_ids - df["edge_type"] = edge_types - df["hop_id"] = hop_ids - - if return_offsets: - offsets_df = cudf.DataFrame( - { - "batch_id": batch_ids, - "offsets": offsets[:-1], - } - ) + ( + sources, + destinations, + weights, + edge_ids, + edge_types, + batch_ids, + offsets, + hop_ids, + ) = sampling_result + + df["sources"] = sources + df["destinations"] = destinations + df["weight"] = weights + df["edge_id"] = edge_ids + df["edge_type"] = edge_types + df["hop_id"] = hop_ids - else: - if len(batch_ids) > 0: - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) - batch_ids.reset_index(drop=True, inplace=True) - - df["batch_id"] = batch_ids + if return_offsets: + offsets_df = cudf.DataFrame( + { + "batch_id": batch_ids, + "offsets": offsets[:-1], + } + ) else: - sources, destinations, indices = sampling_result - - df["sources"] = sources - df["destinations"] = destinations + if len(batch_ids) > 0: + batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) + batch_ids.reset_index(drop=True, inplace=True) - df["indices"] = indices - if weight_t == "int32": - df["indices"] = indices.astype("int32") - elif weight_t == "int64": - df["indices"] = indices.astype("int64") - else: - df["indices"] = indices + df["batch_id"] = batch_ids if G.renumbered: df = G.unrenumber(df, "sources", preserve_order=True) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 6fe16d97713..a2ee9934b81 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -333,7 +333,6 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): start_list=start_df["seed"], 
fanout_vals=[2, 2], with_replacement=False, - with_edge_properties=True, batch_id_list=start_df["batch"], return_offsets=return_offsets, ) @@ -393,7 +392,6 @@ def test_uniform_neighbor_sample_edge_properties_self_loops(): batch_id_list=cudf.Series([1, 1, 1], dtype="int32"), fanout_vals=[2, 2], with_replacement=False, - with_edge_properties=True, random_state=80, ) @@ -446,7 +444,6 @@ def test_uniform_neighbor_sample_empty_start_list(): batch_id_list=cudf.Series([], dtype="int32"), fanout_vals=[2, 2], with_replacement=False, - with_edge_properties=True, random_state=32, ) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 76657eb634f..7522f7ee31c 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -353,7 +353,6 @@ def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets): start_list=cudf.Series([0, 4], dtype="int64"), fanout_vals=[-1, -1], with_replacement=False, - with_edge_properties=True, batch_id_list=cudf.Series([0, 1], dtype="int32"), label_list=cudf.Series([0, 1], dtype="int32") if return_offsets else None, label_to_output_comm_rank=cudf.Series(dest_rank, dtype="int32") @@ -447,7 +446,6 @@ def test_uniform_neighbor_sample_edge_properties_self_loops(dask_client): ), fanout_vals=[2, 2], with_replacement=False, - with_edge_properties=True, ).compute() assert sorted(sampling_results.sources.values_host.tolist()) == [0, 0, 1, 1, 2, 2] @@ -507,7 +505,6 @@ def test_uniform_neighbor_edge_properties_sample_small_start_list( start_list=cudf.Series([0]), fanout_vals=[10, 25], with_replacement=with_replacement, - with_edge_properties=True, batch_id_list=cudf.Series([10], dtype="int32"), ) @@ -541,7 +538,6 @@ def test_uniform_neighbor_sample_without_dask_inputs(dask_client): batch_id_list=cudf.Series([1, 1, 1], dtype="int32"), fanout_vals=[2, 2], with_replacement=False, - with_edge_properties=True, ).compute() assert sorted(sampling_results.sources.values_host.tolist()) == [0, 0, 1, 1, 2, 2] @@ -608,7 +604,6 @@ def test_uniform_neighbor_sample_batched(dask_client, dataset, input_df, max_bat batch_id_list=input_batch, fanout_vals=[5, 5], with_replacement=False, - with_edge_properties=True, ) for batch_id in range(max_batches):