Commit: Merge branch 'branch-23.12' into adding_build_all

BradReesWork authored Oct 9, 2023
2 parents 0745336 + 2c1626f commit 79be311
Showing 46 changed files with 1,955 additions and 904 deletions.
4 changes: 2 additions & 2 deletions .devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
6 changes: 3 additions & 3 deletions .devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,13 +5,13 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
4 changes: 2 additions & 2 deletions .devcontainer/cuda12.0-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "12.0",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
6 changes: 3 additions & 3 deletions .devcontainer/cuda12.0-pip/devcontainer.json
@@ -5,13 +5,13 @@
     "args": {
       "CUDA": "12.0",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda12.0-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
@@ -133,5 +133,5 @@ jobs:
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
       build_command: |
         sccache -z;
-        build-all --verbose;
+        build-all --verbose -j$(nproc --ignore=1);
         sccache -s;
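This workflow change is the substance of the adding_build_all work: the CI build now passes `-j$(nproc --ignore=1)`, so `build-all` runs one parallel job per available CPU while leaving a core free for the runner itself. A rough Python equivalent of that shell substitution, for illustration only (the job runs the shell form above; the core count in the comment is made up):

    import os

    # Approximate `nproc --ignore=1`: all available CPUs minus one, clamped
    # so that at least one build job always runs (assumption: os.cpu_count()
    # reflects the CPUs the runner actually exposes).
    jobs = max(1, (os.cpu_count() or 2) - 1)
    print(f"build-all --verbose -j{jobs}")  # e.g. "build-all --verbose -j7" on 8 CPUs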
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
       - id: black
         language_version: python3
         args: [--target-version=py38]
-        files: ^python/
+        files: ^(python/.*|benchmarks/.*)$
   - repo: https://github.com/PyCQA/flake8
     rev: 6.0.0
     hooks:
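The widened `files` pattern brings benchmarks/ under black's pre-commit formatting alongside python/, which is what produces the reformat-only hunks in the benchmark files below. A quick sketch of the new regex's scope (all paths here are hypothetical examples, not files from the repo):

    import re

    # New pre-commit scope for black: the python/ and benchmarks/ trees only.
    pattern = re.compile(r"^(python/.*|benchmarks/.*)$")

    for path in (
        "python/cugraph/structure/graph.py",  # matched before and after the change
        "benchmarks/dgl/bench_sampling.py",   # newly in scope
        "cpp/src/community/louvain.cu",       # still out of scope
    ):
        print(f"{path}: {'formatted' if pattern.match(path) else 'skipped'}")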
@@ -21,6 +21,7 @@
 import pytest
 import numpy as np
 import cupy as cp
+
 # Facing issues with rapids-pytest-benchmark plugin
 # pytest-benchmark.
 import pytest_benchmark
@@ -33,6 +34,7 @@
 import dgl
 import torch
 import rmm
+
 _seed = 42


@@ -71,29 +73,31 @@ def create_graph(graph_data):
     else:
         raise TypeError(f"graph_data can only be str or dict, got {type(graph_data)}")

-    num_nodes = max(edgelist_df['src'].max(),
-                    edgelist_df['dst'].max())+1
+    num_nodes = max(edgelist_df["src"].max(), edgelist_df["dst"].max()) + 1

-    num_nodes_dict = {'_N':num_nodes}
+    num_nodes_dict = {"_N": num_nodes}

     gs = CuGraphStorage(num_nodes_dict=num_nodes_dict, single_gpu=True)
-    gs.add_edge_data(edgelist_df,
-                     # reverse to make same graph as cugraph
-                     node_col_names=['dst', 'src'],
-                     canonical_etype=['_N', 'connects', '_N'])
+    gs.add_edge_data(
+        edgelist_df,
+        # reverse to make same graph as cugraph
+        node_col_names=["dst", "src"],
+        canonical_etype=["_N", "connects", "_N"],
+    )

     return gs


-
 def create_mg_graph(graph_data):
     """
     Create a graph instance based on the data to be loaded/generated.
     """
     # range starts at 1 to let 0 be used by benchmark/client process
     visible_devices = os.getenv("DASK_WORKER_DEVICES", "1,2,3,4")

-    cluster = LocalCUDACluster(protocol='ucx', rmm_pool_size='25GB', CUDA_VISIBLE_DEVICES=visible_devices)
+    cluster = LocalCUDACluster(
+        protocol="ucx", rmm_pool_size="25GB", CUDA_VISIBLE_DEVICES=visible_devices
+    )
     client = Client(cluster)
     Comms.initialize(p2p=True)
     rmm.reinitialize(pool_allocator=True)
@@ -126,25 +130,23 @@ def create_mg_graph(graph_data):
     else:
         raise TypeError(f"graph_data can only be str or dict, got {type(graph_data)}")

-    num_nodes = max(edgelist_df['src'].max().compute(),
-                    edgelist_df['dst'].max().compute())
+    num_nodes = max(
+        edgelist_df["src"].max().compute(), edgelist_df["dst"].max().compute()
+    )

     # running into issues with smaller partitions
-    edgelist_df = edgelist_df.repartition(npartitions=edgelist_df.npartitions*2)
+    edgelist_df = edgelist_df.repartition(npartitions=edgelist_df.npartitions * 2)

-    num_nodes_dict = {'_N':num_nodes}
+    num_nodes_dict = {"_N": num_nodes}

-    gs = CuGraphStorage(num_nodes_dict=num_nodes_dict, single_gpu=False)
-    gs.add_edge_data(edgelist_df,
-                     node_col_names=['dst', 'src'],
-                     canonical_etype=['_N', 'C', '_N'])
+    gs = CuGraphStorage(num_nodes_dict=num_nodes_dict, single_gpu=False)
+    gs.add_edge_data(
+        edgelist_df, node_col_names=["dst", "src"], canonical_etype=["_N", "C", "_N"]
+    )
     return (gs, client, cluster)


-
-def get_uniform_neighbor_sample_args(
-    G, seed, batch_size, fanout, with_replacement
-):
+def get_uniform_neighbor_sample_args(G, seed, batch_size, fanout, with_replacement):
     """
     Return a dictionary containing the args for uniform_neighbor_sample based
     on the graph and desired args passed in. For example, if a large start list
@@ -165,7 +167,7 @@ def get_uniform_neighbor_sample_args(
     else:
         num_start_verts = batch_size

-    srcs = G.graphstore.gdata.get_edge_data()['_SRC_']
+    srcs = G.graphstore.gdata.get_edge_data()["_SRC_"]
     start_list = srcs.head(num_start_verts)
     assert len(start_list) == num_start_verts

@@ -205,7 +207,6 @@ def graph_objs(request):
     dask_cluster.close()


-
 ################################################################################
 # Benchmarks
 @pytest.mark.parametrize("batch_size", params.batch_sizes.values())
@@ -223,15 +224,15 @@ def bench_cugraph_dgl_uniform_neighbor_sample(

     # Reverse to match cugraph
     # DGL does from dst to src
-    fanout_val = uns_args['fanout']
+    fanout_val = uns_args["fanout"]
     fanout_val.reverse()
     sampler = dgl.dataloading.NeighborSampler(uns_args["fanout"])
     sampler_f = sampler.sample_blocks

     # Warmup
     _ = sampler_f(g=G, seed_nodes=uns_args["seed_nodes"])
     # print(f"\n{uns_args}")
-    result_seed_nodes, output_nodes, blocks = benchmark(
+    result_seed_nodes, output_nodes, blocks = benchmark(
         sampler_f,
         g=G,
         seed_nodes=uns_args["seed_nodes"],
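Most hunks above are quote-style and line-wrapping changes from black; the one behavioral detail worth noting is the in-place fanout reversal, which (per the in-code comment) reorders cuGraph-style fanouts before they reach DGL's NeighborSampler. A standalone sketch of that reversal, with a made-up fanout list in place of the benchmark's `uns_args`:

    # cuGraph lists fanouts from the first hop outward; the benchmark reverses
    # the list in place before handing it to DGL (values here are hypothetical).
    fanout = [25, 10, 5]   # hop 1 -> hop 3, cuGraph order
    fanout.reverse()       # now [5, 10, 25], the order the DGL sampler consumes
    assert fanout == [5, 10, 25]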
@@ -52,13 +52,9 @@ def load_edges_from_disk(parquet_path, replication_factor, input_meta):
         src_ls = [ei["src"]]
         dst_ls = [ei["dst"]]
         for r in range(1, replication_factor):
-            new_src = ei["src"] + (
-                r * input_meta["num_nodes"][can_edge_type[0]]
-            )
+            new_src = ei["src"] + (r * input_meta["num_nodes"][can_edge_type[0]])
             src_ls.append(new_src)
-            new_dst = ei["dst"] + (
-                r * input_meta["num_nodes"][can_edge_type[2]]
-            )
+            new_dst = ei["dst"] + (r * input_meta["num_nodes"][can_edge_type[2]])
             dst_ls.append(new_dst)

         ei["src"] = torch.cat(src_ls).contiguous()
@@ -92,16 +88,11 @@ def load_node_labels(dataset_path, replication_factor, input_meta):
                     ]
                 ),
                 "label": pd.concat(
-                    [
-                        node_label.label
-                        for r in range(1, replication_factor)
-                    ]
+                    [node_label.label for r in range(1, replication_factor)]
                 ),
             }
         )
-        node_label = pd.concat([node_label, dfr]).reset_index(
-            drop=True
-        )
+        node_label = pd.concat([node_label, dfr]).reset_index(drop=True)

         node_label_tensor = torch.full(
             (num_nodes_dict[node_type],), -1, dtype=torch.float32
@@ -133,9 +124,7 @@ def create_dgl_graph_from_disk(dataset_path, replication_factor=1):
         input_meta = json.load(f)

     parquet_path = os.path.join(dataset_path, "parquet")
-    graph_data = load_edges_from_disk(
-        parquet_path, replication_factor, input_meta
-    )
+    graph_data = load_edges_from_disk(parquet_path, replication_factor, input_meta)
     node_data = load_node_labels(dataset_path, replication_factor, input_meta)
     g = dgl.heterograph(graph_data)

@@ -154,7 +143,7 @@ def create_dataloader(g, train_idx, batch_size, fanouts, use_uva):
     Returns:
         DGLGraph: DGLGraph with the loaded dataset.
     """
-
+    print("Creating dataloader", flush=True)
     st = time.time()
     if use_uva:
@@ -220,21 +209,21 @@ def dataloading_benchmark(g, train_idx, fanouts, batch_sizes, use_uva):
     print("==============================================")
     return time_ls

+
 def set_seed(seed):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)

+
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument(
         "--dataset_path", type=str, default="/datasets/abarghi/ogbn_papers100M"
     )
     parser.add_argument("--replication_factors", type=str, default="1,2,4,8")
-    parser.add_argument(
-        "--fanouts", type=str, default="25_25,10_10_10,5_10_20"
-    )
+    parser.add_argument("--fanouts", type=str, default="25_25,10_10_10,5_10_20")
     parser.add_argument("--batch_sizes", type=str, default="512,1024")
     parser.add_argument("--do_not_use_uva", action="store_true")
     parser.add_argument("--seed", type=int, default=42)
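The `--fanouts` and `--batch_sizes` defaults are packed strings; the parsing code is not part of this diff, but the format suggests commas separate sampling configurations and underscores separate per-hop fanouts. A hedged parsing sketch under that assumption (`parse_fanouts` is a hypothetical helper, not a function from the script):

    def parse_fanouts(spec):
        """Assumed format: comma-separated configs, underscore-separated hops."""
        return [[int(hop) for hop in config.split("_")] for config in spec.split(",")]

    # Matches the argparse default shown above.
    assert parse_fanouts("25_25,10_10_10,5_10_20") == [
        [25, 25],
        [10, 10, 10],
        [5, 10, 20],
    ]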
@@ -267,22 +256,16 @@ def set_seed(seed):
         et = time.time()
         print(f"Replication factor = {replication_factor}")
         print(
-            f"G has {g.num_edges()} edges and took",
-            f" {et - st:.2f} seconds to load"
+            f"G has {g.num_edges()} edges and took", f" {et - st:.2f} seconds to load"
         )
         train_idx = {"paper": node_data["paper"]["train_idx"]}
         r_time_ls = dataloading_benchmark(
             g, train_idx, fanouts, batch_sizes, use_uva=use_uva
         )
-        print(
-            "Benchmark completed for replication factor = ", replication_factor
-        )
+        print("Benchmark completed for replication factor = ", replication_factor)
         print("==============================================")
         # Add replication factor to the time list
-        [
-            x.update({"replication_factor": replication_factor})
-            for x in r_time_ls
-        ]
+        [x.update({"replication_factor": replication_factor}) for x in r_time_ls]
         time_ls.extend(r_time_ls)

     df = pd.DataFrame(time_ls)
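The reformat-only hunks earlier in this file touch load_edges_from_disk, whose offsetting lines carry the dataset-scaling idea: replica r shifts its endpoint IDs by r times the node count of the endpoint type, so every replica occupies a disjoint ID range. A standalone sketch of that offsetting with toy tensors (all values made up):

    import torch

    # Toy edge endpoints for one replica: 3 edges over 4 nodes.
    src, num_nodes, replication_factor = torch.tensor([0, 1, 3]), 4, 3

    # Each replica r gets IDs offset by r * num_nodes, so replicas never collide.
    src_ls = [src] + [src + r * num_nodes for r in range(1, replication_factor)]
    replicated = torch.cat(src_ls).contiguous()
    print(replicated)  # tensor([ 0,  1,  3,  4,  5,  7,  8,  9, 11])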
(remaining changed files not shown)
