rapidsai · rapids-bot · Mar 11, 2024 · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023
@@ -152,7 +152,7 @@ Next are standard GNN training arguments such as `FANOUT`, `BATCH_SIZE`, etc.  Y
 the number of training epochs here.  These are followed by the `REPLICATION_FACTOR` argument, which
 can be used to create replications of the dataset for scale testing purposes.
 
-The final two arguments are `FRAMEWORK` which can be either "cuGraphPyG" or "PyG", and `GPUS_PER_NODE`
+The final two arguments are `FRAMEWORK`, which can be "cuGraphDGL", "cuGraphPyG", or "PyG", and `GPUS_PER_NODE`,
 which must be set to the correct value, even if this is provided by a SLURM argument.  If `GPUS_PER_NODE`
 is not set to the correct number of GPUs, the script will hang indefinitely until it times out.  Mismatched
 GPUs per node is currently unsupported by this script but should be possible in practice.

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -123,6 +123,13 @@ def parse_args():
         required=True,
     )
 
+    parser.add_argument(
+        "--use_wholegraph",
+        action="store_true",
+        help="Whether to use WholeGraph feature storage",
+        required=False,
+    )
+
     parser.add_argument(
         "--model",
         type=str,
@@ -162,6 +169,13 @@ def parse_args():
         required=False,
     )
 
+    parser.add_argument(
+        "--skip_download",
+        action="store_true",
+        help="Whether to skip downloading",
+        required=False,
+    )
+
     return parser.parse_args()
 
 
@@ -186,16 +200,38 @@ def main(args):
 
     world_size = int(os.environ["SLURM_JOB_NUM_NODES"]) * args.gpus_per_node
 
+    if args.use_wholegraph:
+        # TODO support WG without cuGraph
+        if args.framework not in ["cuGraphPyG", "cuGraphDGL"]:
+            raise ValueError("WG feature store only supported with cuGraph backends")
+        from pylibwholegraph.torch.initialize import (
+            get_global_communicator,
+            get_local_node_communicator,
+            init,
+        )
+
+        logger.info("initializing WG comms...")
+        init(global_rank, world_size, local_rank, args.gpus_per_node)
+        wm_comm = get_global_communicator()
+        get_local_node_communicator()
+
+        wm_comm = wm_comm.wmb_comm
+        logger.info(f"rank {global_rank} successfully initialized WG comms")
+        wm_comm.barrier()
+
     dataset = OGBNPapers100MDataset(
         replication_factor=args.replication_factor,
         dataset_dir=args.dataset_dir,
         train_split=args.train_split,
         val_split=args.val_split,
         load_edge_index=(args.framework == "PyG"),
+        backend="wholegraph" if args.use_wholegraph else "torch",
     )
 
-    if global_rank == 0:
+    # Note: this does not generate WG files
+    if global_rank == 0 and not args.skip_download:
         dataset.download()
+
     dist.barrier()
 
     fanout = [int(f) for f in args.fanout.split("_")]
@@ -234,6 +270,28 @@ def main(args):
             replace=False,
             num_neighbors=fanout,
             batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
+        )
+    elif args.framework == "cuGraphDGL":
+        sample_dir = os.path.join(
+            args.sample_dir,
+            f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}",
+        )
+        from trainers.dgl import DGLCuGraphTrainer
+
+        trainer = DGLCuGraphTrainer(
+            model=args.model,
+            dataset=dataset,
+            sample_dir=sample_dir,
+            device=local_rank,
+            rank=global_rank,
+            world_size=world_size,
+            num_epochs=args.num_epochs,
+            shuffle=True,
+            replace=False,
+            num_neighbors=[int(f) for f in args.fanout.split("_")],
+            batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
         )
     else:
         raise ValueError("unsupported framework")

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
@@ -200,19 +200,20 @@ def sample_graph(
 
     total_time = 0.0
     for epoch in range(num_epochs):
-        steps = [("train", train_df), ("test", test_df)]
+        steps = [("train", train_df)]
         if epoch == num_epochs - 1:
             steps.append(("val", val_df))
+            steps.append(("test", test_df))
 
         for step, batch_df in steps:
             batch_df = batch_df.sample(frac=1.0, random_state=seed)
 
-            if step == "val":
-                output_sample_path = os.path.join(output_path, "val", "samples")
-            else:
+            if step == "train":
                 output_sample_path = os.path.join(
                     output_path, f"epoch={epoch}", f"{step}", "samples"
                 )
+            else:
+                output_sample_path = os.path.join(output_path, step, "samples")
             os.makedirs(output_sample_path)
 
             sampler = BulkSampler(
@@ -372,7 +373,7 @@ def load_disk_dataset(
         can_edge_type = tuple(edge_type.split("__"))
         edge_index_dict[can_edge_type] = dask_cudf.read_parquet(
             Path(parquet_path) / edge_type / "edge_index.parquet"
-        ).repartition(n_workers * 2)
+        ).repartition(npartitions=n_workers * 2)
 
         edge_index_dict[can_edge_type]["src"] += node_offsets_replicated[
             can_edge_type[0]
@@ -431,7 +432,7 @@ def load_disk_dataset(
         if os.path.exists(node_label_path):
             node_labels[node_type] = (
                 dask_cudf.read_parquet(node_label_path)
-                .repartition(n_workers)
+                .repartition(npartitions=n_workers)
                 .drop("label", axis=1)
                 .persist()
             )
@@ -574,8 +575,8 @@ def benchmark_cugraph_bulk_sampling(
             "use_legacy_names": False,
             "include_hop_column": False,
         }
-    else:
-        # FIXME: Update these arguments when CSC mode is fixed in cuGraph-PyG (release 24.02)
+    elif sampling_target_framework == "cugraph_pyg":
+        # FIXME: Update these arguments when CSC mode is fixed in cuGraph-PyG (release 24.04)
         sampling_kwargs = {
             "deduplicate_sources": True,
             "prior_sources_behavior": "exclude",
@@ -585,8 +586,10 @@ def benchmark_cugraph_bulk_sampling(
             "use_legacy_names": False,
             "include_hop_column": True,
         }
+    else:
+        raise ValueError("Only cugraph_dgl_csr or cugraph_pyg are valid frameworks")
 
-    batches_per_partition = 600_000 // batch_size
+    batches_per_partition = 256
     execution_time, allocation_counts = sample_graph(
         G=G,
         label_df=dask_label_df,
@@ -761,9 +764,9 @@ def get_args():
     logger.setLevel(logging.INFO)
 
     args = get_args()
-    if args.sampling_target_framework not in ["cugraph_dgl_csr", None]:
+    if args.sampling_target_framework not in ["cugraph_dgl_csr", "cugraph_pyg"]:
         raise ValueError(
-            "sampling_target_framework must be one of cugraph_dgl_csr or None",
+            "sampling_target_framework must be one of cugraph_dgl_csr or cugraph_pyg",
             "Other frameworks are not supported at this time.",
         )
 

@@ -34,6 +34,7 @@ def __init__(
         train_split=0.8,
         val_split=0.5,
         load_edge_index=True,
+        backend="torch",
     ):
         self.__replication_factor = replication_factor
         self.__disk_x = None
@@ -43,6 +44,7 @@ def __init__(
         self.__train_split = train_split
         self.__val_split = val_split
         self.__load_edge_index = load_edge_index
+        self.__backend = backend
 
     def download(self):
         import logging
@@ -152,6 +154,27 @@ def download(self):
             )
             ldf.to_parquet(node_label_file_path)
 
+        # WholeGraph
+        wg_bin_file_path = os.path.join(dataset_path, "wgb", "paper")
+        if self.__replication_factor == 1:
+            wg_bin_rep_path = os.path.join(wg_bin_file_path, "node_feat.d")
+        else:
+            wg_bin_rep_path = os.path.join(
+                wg_bin_file_path, f"node_feat_{self.__replication_factor}x.d"
+            )
+
+        if not os.path.exists(wg_bin_rep_path):
+            os.makedirs(wg_bin_rep_path)
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+            node_feat = dataset[0][0]["node_feat"]
+            for k in range(self.__replication_factor):
+                node_feat.tofile(os.path.join(wg_bin_rep_path, f"{k:04d}.bin"))
+
     @property
     def edge_index_dict(
         self,
@@ -224,21 +247,59 @@ def edge_index_dict(
 
     @property
     def x_dict(self) -> Dict[str, torch.Tensor]:
+        if self.__disk_x is None:
+            if self.__backend == "wholegraph":
+                self.__load_x_wg()
+            else:
+                self.__load_x_torch()
+
+        return self.__disk_x
+
+    def __load_x_torch(self) -> None:
         node_type_path = os.path.join(
             self.__dataset_dir, "ogbn_papers100M", "npy", "paper"
         )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.npy")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.npy"
+            )
+
+        self.__disk_x = {"paper": torch.as_tensor(np.load(full_path, mmap_mode="r"))}
+
+    def __load_x_wg(self) -> None:
+        import logging
+
+        logger = logging.getLogger("OGBNPapers100MDataset")
+        logger.info("Loading x into WG embedding...")
+
+        import pylibwholegraph.torch as wgth
+
+        node_type_path = os.path.join(
+            self.__dataset_dir, "ogbn_papers100M", "wgb", "paper"
+        )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.d")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.d"
+            )
 
-        if self.__disk_x is None:
-            if self.__replication_factor == 1:
-                full_path = os.path.join(node_type_path, "node_feat.npy")
-            else:
-                full_path = os.path.join(
-                    node_type_path, f"node_feat_{self.__replication_factor}x.npy"
-                )
+        file_list = [os.path.join(full_path, f) for f in os.listdir(full_path)]
+
+        x = wgth.create_embedding_from_filelist(
+            wgth.get_global_communicator(),
+            "distributed",  # TODO support other options
+            "cpu",  # TODO support GPU
+            file_list,
+            torch.float32,
+            128,
+        )
 
-            self.__disk_x = {"paper": np.load(full_path, mmap_mode="r")}
+        logger.info("created x wg embedding")
 
-        return self.__disk_x
+        self.__disk_x = {"paper": x}
 
     @property
     def y_dict(self) -> Dict[str, torch.Tensor]:
@@ -321,7 +382,7 @@ def __get_labels(self):
             torch.as_tensor(node_label.node.values, device="cpu")
         ] = torch.as_tensor(node_label.label.values, device="cpu")
 
-        self.__y = {"paper": node_label_tensor.contiguous()}
+        self.__y = {"paper": node_label_tensor.to(torch.int16).contiguous()}
 
         train_ix, test_val_ix = train_test_split(
             torch.as_tensor(node_label.node.values),

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .models_dgl import GraphSAGE
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+
+
+class GraphSAGE(torch.nn.Module):
+    """
+    GraphSAGE for DGL blocks: a stack of mean-aggregating SAGEConv layers.
+
+    model_backend "dgl" takes SAGEConv from dgl.nn; any other value takes it
+    from cugraph_dgl.nn.  num_layers layers map in -> hidden -> ... -> out,
+    with ReLU and dropout (p=0.5, training mode only) between layers.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        num_layers,
+        model_backend="dgl",
+    ):
+        # Function-scope import: only the selected backend's package is imported.
+        if model_backend == "dgl":
+            from dgl.nn import SAGEConv
+        else:
+            from cugraph_dgl.nn import SAGEConv
+
+        super().__init__()
+        self.convs = torch.nn.ModuleList()
+        for _ in range(num_layers - 1):
+            self.convs.append(
+                SAGEConv(in_channels, hidden_channels, aggregator_type="mean")
+            )
+            in_channels = hidden_channels
+        # in_channels now holds the width feeding the last layer; it is still the
+        # raw feature size when num_layers == 1, which hidden_channels got wrong.
+        self.convs.append(SAGEConv(in_channels, out_channels, aggregator_type="mean"))
+
+    def forward(self, blocks, x):
+        """Run layer i on blocks[i]; ReLU + dropout follow all but the last layer."""
+        for i, conv in enumerate(self.convs):
+            x = conv(blocks[i], x)
+            if i != len(self.convs) - 1:
+                x = F.relu(x)
+                # F.dropout defaults to training=True, so follow train()/eval().
+                x = F.dropout(x, p=0.5, training=self.training)
+        return x
+
+
+def create_model(feat_size, num_classes, num_layers, model_backend="dgl"):
+    """Build a GraphSAGE with 64 hidden channels on "cuda", in training mode."""
+    hidden_size = 64
+    net = GraphSAGE(
+        feat_size, hidden_size, num_classes, num_layers, model_backend=model_backend
+    )
+    return net.to("cuda").train()