From 6531e145d8d281347aaf643ba6e63e5da2c4bfc3 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 26 Sep 2023 20:44:24 -0700 Subject: [PATCH] enable csc loader --- .../cugraph_dgl/dataloading/dataloader.py | 2 ++ .../cugraph_dgl/dataloading/dataset.py | 23 ++++++++++++++----- .../dataloading/utils/sampling_helpers.py | 6 +++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index e7b1d3f41aa..b8241f489e5 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -230,7 +230,9 @@ def __iter__(self): if self.sparse_format == "csc": kwargs["compression"] = "CSR" kwargs["compress_per_hop"] = True + # The following kwargs will be deprecated in uniform sampler. kwargs["use_legacy_names"] = False + kwargs["include_hop_column"] = False else: kwargs["deduplicate_sources"] = False diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py index 93e673fde82..815fd30d8eb 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py @@ -19,6 +19,7 @@ from cugraph_dgl.dataloading.utils.sampling_helpers import ( create_homogeneous_sampled_graphs_from_dataframe, create_heterogeneous_sampled_graphs_from_dataframe, + create_homogeneous_sampled_graphs_from_dataframe_csc, ) @@ -62,10 +63,20 @@ def __getitem__(self, idx: int): fn, batch_offset = self._batch_to_fn_d[idx] if fn != self._current_batch_fn: - df = _load_sampled_file(dataset_obj=self, fn=fn) - self._current_batches = create_homogeneous_sampled_graphs_from_dataframe( - sampled_df=df, edge_dir=self.edge_dir, return_type=self._return_type - ) + if self.sparse_format == "csc": + df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True) + self._current_batches = ( + create_homogeneous_sampled_graphs_from_dataframe_csc(df) + ) + else: + df = _load_sampled_file(dataset_obj=self, fn=fn) + self._current_batches = ( + create_homogeneous_sampled_graphs_from_dataframe( + sampled_df=df, + edge_dir=self.edge_dir, + return_type=self._return_type, + ) + ) current_offset = idx - batch_offset return self._current_batches[current_offset] @@ -152,9 +163,9 @@ def set_input_files( ) -def _load_sampled_file(dataset_obj, fn): +def _load_sampled_file(dataset_obj, fn, skip_rename=False): df = cudf.read_parquet(os.path.join(fn)) - if dataset_obj.edge_dir == "in": + if dataset_obj.edge_dir == "in" and not skip_rename: df.rename( columns={"sources": "destinations", "destinations": "sources"}, inplace=True, diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 8932c866b57..26e33166d4e 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -522,5 +522,7 @@ def _create_homogeneous_sparse_graphs_from_csc( return output -def create_homogeneous_sampled_graphs_from_dataframe_csc(df): - return _create_homogeneous_sparse_graphs_from_csc(*(_process_sampled_df_csc(df))) +def create_homogeneous_sampled_graphs_from_dataframe_csc(sampled_df: cudf.DataFrame): + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_df_csc(sampled_df)) + )