Merge branch 'branch-24.04' into branch-24.04-nx_cugraph_benchmarks
nv-rliu authored Mar 12, 2024
2 parents 0efab64 + 6171bd9 commit a9560d8
Showing 68 changed files with 2,171 additions and 1,103 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
       - checks
       - conda-cpp-build
       - conda-cpp-tests
+      - conda-cpp-checks
       - conda-notebook-tests
       - conda-python-build
       - conda-python-tests
@@ -52,6 +53,14 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
     with:
       build_type: pull-request
+  conda-cpp-checks:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    with:
+      build_type: pull-request
+      enable_check_symbols: true
+      symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
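Both here and in the nightly `test.yaml` below, `enable_check_symbols` turns on a post-build inspection of the conda C++ artifacts, with `symbol_exclusions` as a plain regular expression over symbol names. As a rough sketch of how such a pattern filters demangled symbols (the real logic lives in the `rapidsai/shared-workflows` reusable workflow; the matching shown here is an assumption for illustration):

```python
import re

# Hedged sketch: applying the symbol_exclusions pattern from the workflow
# above to demangled C++ symbol names. The actual check is implemented in
# the shared conda-cpp-checks workflow, not in this snippet.
exclusions = re.compile(
    r"(cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)"
)

symbols = [
    "cugraph::ops::graph_csc_int32",
    "void writeEdgeCountsKernel<int>(int const*, int*)",
    "std::vector<int>::push_back(int&&)",
]

# Only symbols that do NOT match the exclusion pattern would be checked.
checked = [s for s in symbols if not exclusions.search(s)]
print(checked)  # ['std::vector<int>::push_back(int&&)']
```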
10 changes: 10 additions & 0 deletions .github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
         type: string
 
 jobs:
+  conda-cpp-checks:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      enable_check_symbols: true
+      symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
20 changes: 12 additions & 8 deletions .pre-commit-config.yaml
@@ -46,16 +46,20 @@ repos:
         )
       types_or: [c, c++, cuda]
       args: ["-fallback-style=none", "-style=file", "-i"]
-  - repo: local
-    hooks:
-      - id: copyright-check
-        name: copyright-check
-        entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year
-        language: python
-        pass_filenames: false
-        additional_dependencies: [gitpython]
   - repo: https://github.com/rapidsai/dependency-file-generator
     rev: v1.8.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
+  - repo: https://github.com/rapidsai/pre-commit-hooks
+    rev: v0.0.1
+    hooks:
+      - id: verify-copyright
+        files: |
+          (?x)
+          [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
+          CMakeLists[.]txt$|
+          CMakeLists_standalone[.]txt$|
+          [.]flake8[.]cython$|
+          meta[.]yaml$|
+          setup[.]cfg$
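The new hook's `files` value starts with `(?x)`, Python's verbose-regex flag, so the whitespace and newlines inside the pattern are ignored; pre-commit matches it against each file path with search semantics. A quick sketch of which paths it selects (the sample paths are illustrative, not from the repository):

```python
import re

# The verify-copyright hook's `files` pattern; the (?x) in the original is
# mirrored here with re.VERBOSE.
pattern = re.compile(
    r"""
    [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
    CMakeLists[.]txt$|
    CMakeLists_standalone[.]txt$|
    [.]flake8[.]cython$|
    meta[.]yaml$|
    setup[.]cfg$
    """,
    re.VERBOSE,
)

for path in ["cpp/src/utilities/graph.cu", "conda/recipes/cugraph/meta.yaml", "README.md"]:
    print(path, bool(pattern.search(path)))
# cpp/src/utilities/graph.cu True
# conda/recipes/cugraph/meta.yaml True
# README.md False
```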
2 changes: 1 addition & 1 deletion benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -152,7 +152,7 @@ Next are standard GNN training arguments such as `FANOUT`, `BATCH_SIZE`, etc. You can also set
 the number of training epochs here. These are followed by the `REPLICATION_FACTOR` argument, which
 can be used to create replications of the dataset for scale testing purposes.
 
-The final two arguments are `FRAMEWORK` which can be either "cuGraphPyG" or "PyG", and `GPUS_PER_NODE`
+The final two arguments are `FRAMEWORK`, which can be "cugraph_dgl_csr", "cugraph_pyg", or "pyg", and `GPUS_PER_NODE`,
 which must be set to the correct value, even if this is provided by a SLURM argument. If `GPUS_PER_NODE`
 is not set to the correct number of GPUs, the script will hang indefinitely until it times out. Mismatched
 GPU counts per node are currently unsupported by this script but should be possible in practice.
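A fail-fast guard along these lines would surface the mismatch immediately instead of hanging (a sketch only, not part of the benchmark script; reading `GPUS_PER_NODE` from the environment here is an assumption):

```python
import os
import torch

# Hypothetical guard illustrating the README's warning: if GPUS_PER_NODE
# disagrees with the GPUs actually visible on the node, the distributed
# setup waits forever for ranks that never start.
gpus_per_node = int(os.environ["GPUS_PER_NODE"])
visible = torch.cuda.device_count()
if gpus_per_node != visible:
    raise RuntimeError(
        f"GPUS_PER_NODE={gpus_per_node}, but torch sees {visible} GPUs; "
        "refusing to start rather than hang."
    )
```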
@@ -43,8 +43,9 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
 
     rmm.reinitialize(
         devices=[rank],
-        pool_allocator=True,
-        initial_pool_size=pool_size,
+        pool_allocator=False,
+        # pool_allocator=True,
+        # initial_pool_size=pool_size,
     )
 
     if use_rmm_torch_allocator:
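This hunk turns the RMM pool allocator off, leaving the pooled configuration commented out for easy re-enabling. For context, a `use_rmm_torch_allocator` branch like the one this feeds into typically routes PyTorch's CUDA allocations through RMM, roughly as below (a sketch of the common RAPIDS pattern; the branch body itself is not shown in this diff):

```python
import torch
from rmm.allocators.torch import rmm_torch_allocator

# After rmm.reinitialize(...), hand PyTorch's CUDA allocator over to RMM so
# both libraries draw from the same memory resource. This must run before
# any CUDA tensors are allocated.
torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
```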
@@ -119,10 +120,17 @@ def parse_args():
     parser.add_argument(
         "--framework",
         type=str,
-        help="The framework to test (PyG, cuGraphPyG)",
+        help="The framework to test (PyG, cugraph_pyg, cugraph_dgl_csr)",
         required=True,
     )
 
+    parser.add_argument(
+        "--use_wholegraph",
+        action="store_true",
+        help="Whether to use WholeGraph feature storage",
+        required=False,
+    )
+
     parser.add_argument(
         "--model",
         type=str,
@@ -162,6 +170,13 @@ def parse_args():
         required=False,
     )
 
+    parser.add_argument(
+        "--skip_download",
+        action="store_true",
+        help="Whether to skip downloading",
+        required=False,
+    )
+
     return parser.parse_args()
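A minimal sketch of how the two new flags compose with `--framework`, re-declaring just these arguments in isolation rather than the script's full parser:

```python
import argparse

# Only the flags this diff touches, to show how they parse together.
parser = argparse.ArgumentParser()
parser.add_argument("--framework", type=str, required=True)
parser.add_argument("--use_wholegraph", action="store_true", required=False)
parser.add_argument("--skip_download", action="store_true", required=False)

args = parser.parse_args(
    ["--framework", "cugraph_dgl_csr", "--use_wholegraph", "--skip_download"]
)
# store_true flags default to False when omitted.
assert args.use_wholegraph and args.skip_download
```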


@@ -186,21 +201,43 @@ def main(args):
 
     world_size = int(os.environ["SLURM_JOB_NUM_NODES"]) * args.gpus_per_node
 
+    if args.use_wholegraph:
+        # TODO support WG without cuGraph
+        if args.framework.lower() not in ["cugraph_pyg", "cugraph_dgl_csr"]:
+            raise ValueError("WG feature store only supported with cuGraph backends")
+        from pylibwholegraph.torch.initialize import (
+            get_global_communicator,
+            get_local_node_communicator,
+            init,
+        )
+
+        logger.info("initializing WG comms...")
+        init(global_rank, world_size, local_rank, args.gpus_per_node)
+        wm_comm = get_global_communicator()
+        get_local_node_communicator()
+
+        wm_comm = wm_comm.wmb_comm
+        logger.info(f"rank {global_rank} successfully initialized WG comms")
+        wm_comm.barrier()
+
     dataset = OGBNPapers100MDataset(
         replication_factor=args.replication_factor,
         dataset_dir=args.dataset_dir,
         train_split=args.train_split,
         val_split=args.val_split,
-        load_edge_index=(args.framework == "PyG"),
+        load_edge_index=(args.framework.lower() == "pyg"),
+        backend="wholegraph" if args.use_wholegraph else "torch",
     )
 
-    if global_rank == 0:
+    # Note: this does not generate WG files
+    if global_rank == 0 and not args.skip_download:
         dataset.download()
 
     dist.barrier()
 
     fanout = [int(f) for f in args.fanout.split("_")]
 
-    if args.framework == "PyG":
+    if args.framework.lower() == "pyg":
         from trainers.pyg import PyGNativeTrainer
 
         trainer = PyGNativeTrainer(
@@ -215,7 +252,7 @@ def main(args):
             num_neighbors=fanout,
             batch_size=args.batch_size,
         )
-    elif args.framework == "cuGraphPyG":
+    elif args.framework.lower() == "cugraph_pyg":
         sample_dir = os.path.join(
             args.sample_dir,
             f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}",
@@ -229,11 +266,35 @@ def main(args):
             device=local_rank,
             rank=global_rank,
             world_size=world_size,
+            gpus_per_node=args.gpus_per_node,
             num_epochs=args.num_epochs,
             shuffle=True,
             replace=False,
             num_neighbors=fanout,
             batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
         )
+    elif args.framework.lower() == "cugraph_dgl_csr":
+        sample_dir = os.path.join(
+            args.sample_dir,
+            f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}",
+        )
+        from trainers.dgl import DGLCuGraphTrainer
+
+        trainer = DGLCuGraphTrainer(
+            model=args.model,
+            dataset=dataset,
+            sample_dir=sample_dir,
+            device=local_rank,
+            rank=global_rank,
+            world_size=world_size,
+            gpus_per_node=args.gpus_per_node,
+            num_epochs=args.num_epochs,
+            shuffle=True,
+            replace=False,
+            num_neighbors=[int(f) for f in args.fanout.split("_")],
+            batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
+        )
     else:
         raise ValueError("unsupported framework")
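Taken together, the dispatch now normalizes `--framework` with `.lower()` before comparing, so "PyG", "pyg", and "Pyg" all select the same trainer. A condensed view of the branching (the `cugraph_pyg` trainer class name is cut off in this diff, so the one below is hypothetical; the other two appear in the hunks above):

```python
def pick_trainer(framework: str) -> str:
    """Condensed sketch of the case-insensitive dispatch added above."""
    fw = framework.lower()
    if fw == "pyg":
        return "PyGNativeTrainer"    # from trainers.pyg, shown in the diff
    elif fw == "cugraph_pyg":
        return "PyGCuGraphTrainer"   # hypothetical name; the hunk is truncated here
    elif fw == "cugraph_dgl_csr":
        return "DGLCuGraphTrainer"   # from trainers.dgl, shown in the diff
    raise ValueError("unsupported framework")

assert pick_trainer("PyG") == pick_trainer("pyg")
```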