diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7f0b95e3573..7c8c9973462 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
       - checks
       - conda-cpp-build
       - conda-cpp-tests
+      - conda-cpp-checks
       - conda-notebook-tests
       - conda-python-build
       - conda-python-tests
@@ -52,6 +53,14 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
     with:
       build_type: pull-request
+  conda-cpp-checks:  # post-build checks, delegated to the shared workflow below
+    needs: conda-cpp-build  # only starts once the conda C++ build job is done
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    with:
+      build_type: pull-request
+      enable_check_symbols: true  # presumably skips symbols matching symbol_exclusions; confirm in shared-workflows
+      symbol_exclusions: '(cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)'
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 32fb2d62b29..0bd095bfa94 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
         type: string
 
 jobs:
+  conda-cpp-checks:  # post-build checks, delegated to the shared workflow below
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}  # branch, date and sha are forwarded from this workflow's inputs
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      enable_check_symbols: true  # presumably skips symbols matching symbol_exclusions; confirm in shared-workflows
+      symbol_exclusions: '(cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)'
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3d893e0e562..ddb84d8a0f0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -46,16 +46,20 @@ repos:
           )
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
-  - repo: local
-    hooks:
-      - id: copyright-check
-        name: copyright-check
-        entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year
-        language: python
-        pass_filenames: false
-        additional_dependencies: [gitpython]
   - repo: https://github.com/rapidsai/dependency-file-generator
     rev: v1.8.0
     hooks:
         - id: rapids-dependency-file-generator
           args: ["--clean"]
+  - repo: https://github.com/rapidsai/pre-commit-hooks
+    rev: v0.0.1
+    hooks:
+      - id: verify-copyright  # only runs on paths matching the verbose `files` regex below
+        files: |
+          (?x)
+              [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
+              CMakeLists[.]txt$|
+              CMakeLists_standalone[.]txt$|
+              [.]flake8[.]cython$|
+              meta[.]yaml$|
+              setup[.]cfg$
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/README.md b/benchmarks/cugraph/standalone/bulk_sampling/README.md
index 2d09466fb2f..56e9f4f5f64 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/README.md
+++ b/benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -152,7 +152,7 @@ Next are standard GNN training arguments such as `FANOUT`, `BATCH_SIZE`, etc.  Y
 the number of training epochs here.  These are followed by the `REPLICATION_FACTOR` argument, which
 can be used to create replications of the dataset for scale testing purposes.
 
-The final two arguments are `FRAMEWORK` which can be either "cuGraphPyG" or "PyG", and `GPUS_PER_NODE`
+The final two arguments are `FRAMEWORK`, which can be "cugraph_dgl_csr", "cugraph_pyg", or "pyg", and `GPUS_PER_NODE`,
 which must be set to the correct value, even if this is provided by a SLURM argument.  If `GPUS_PER_NODE`
 is not set to the correct number of GPUs, the script will hang indefinitely until it times out.  Mismatched
 GPUs per node is currently unsupported by this script but should be possible in practice.
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
index c9e347b261d..2604642b748 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -43,8 +43,9 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> Non
 
     rmm.reinitialize(
         devices=[rank],
-        pool_allocator=True,
-        initial_pool_size=pool_size,
+        pool_allocator=False,
+        # pool_allocator=True,
+        # initial_pool_size=pool_size,
     )
 
     if use_rmm_torch_allocator:
@@ -119,10 +120,17 @@ def parse_args():
     parser.add_argument(
         "--framework",
         type=str,
-        help="The framework to test (PyG, cuGraphPyG)",
+        help="The framework to test (PyG, cugraph_pyg, cugraph_dgl_csr)",
         required=True,
     )
 
+    parser.add_argument(
+        "--use_wholegraph",
+        action="store_true",
+        help="Whether to use WholeGraph feature storage",
+        required=False,
+    )
+
     parser.add_argument(
         "--model",
         type=str,
@@ -162,6 +170,13 @@ def parse_args():
         required=False,
     )
 
+    parser.add_argument(
+        "--skip_download",
+        action="store_true",
+        help="Whether to skip downloading",
+        required=False,
+    )
+
     return parser.parse_args()
 
 
@@ -186,21 +201,43 @@ def main(args):
 
     world_size = int(os.environ["SLURM_JOB_NUM_NODES"]) * args.gpus_per_node
 
+    if args.use_wholegraph:
+        # TODO support WG without cuGraph
+        if args.framework.lower() not in ["cugraph_pyg", "cugraph_dgl_csr"]:
+            raise ValueError("WG feature store only supported with cuGraph backends")
+        from pylibwholegraph.torch.initialize import (
+            get_global_communicator,
+            get_local_node_communicator,
+            init,
+        )
+
+        logger.info("initializing WG comms...")
+        init(global_rank, world_size, local_rank, args.gpus_per_node)
+        wm_comm = get_global_communicator()
+        get_local_node_communicator()
+
+        wm_comm = wm_comm.wmb_comm
+        logger.info(f"rank {global_rank} successfully initialized WG comms")
+        wm_comm.barrier()
+
     dataset = OGBNPapers100MDataset(
         replication_factor=args.replication_factor,
         dataset_dir=args.dataset_dir,
         train_split=args.train_split,
         val_split=args.val_split,
-        load_edge_index=(args.framework == "PyG"),
+        load_edge_index=(args.framework.lower() == "pyg"),
+        backend="wholegraph" if args.use_wholegraph else "torch",
     )
 
-    if global_rank == 0:
+    # Note: this does not generate WG files
+    if global_rank == 0 and not args.skip_download:
         dataset.download()
+
     dist.barrier()
 
     fanout = [int(f) for f in args.fanout.split("_")]
 
-    if args.framework == "PyG":
+    if args.framework.lower() == "pyg":
         from trainers.pyg import PyGNativeTrainer
 
         trainer = PyGNativeTrainer(
@@ -215,7 +252,7 @@ def main(args):
             num_neighbors=fanout,
             batch_size=args.batch_size,
         )
-    elif args.framework == "cuGraphPyG":
+    elif args.framework.lower() == "cugraph_pyg":
         sample_dir = os.path.join(
             args.sample_dir,
             f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}",
@@ -229,11 +266,35 @@ def main(args):
             device=local_rank,
             rank=global_rank,
             world_size=world_size,
+            gpus_per_node=args.gpus_per_node,
             num_epochs=args.num_epochs,
             shuffle=True,
             replace=False,
             num_neighbors=fanout,
             batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
+        )
+    elif args.framework.lower() == "cugraph_dgl_csr":
+        sample_dir = os.path.join(
+            args.sample_dir,
+            f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}",
+        )
+        from trainers.dgl import DGLCuGraphTrainer
+
+        trainer = DGLCuGraphTrainer(
+            model=args.model,
+            dataset=dataset,
+            sample_dir=sample_dir,
+            device=local_rank,
+            rank=global_rank,
+            world_size=world_size,
+            gpus_per_node=args.gpus_per_node,
+            num_epochs=args.num_epochs,
+            shuffle=True,
+            replace=False,
+            num_neighbors=[int(f) for f in args.fanout.split("_")],
+            batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
         )
     else:
         raise ValueError("unsupported framework")
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
index e3a5bba3162..95e1afcb28b 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
@@ -190,6 +190,10 @@ def sample_graph(
     val_perc=0.5,
     sampling_kwargs={},
 ):
+    logger = logging.getLogger("__main__")
+    logger.info("Starting sampling phase...")
+
+    logger.info("Calculating random splits...")
     cupy.random.seed(seed)
     train_df, test_df = label_df.random_split(
         [train_perc, 1 - train_perc], random_state=seed, shuffle=True
@@ -197,24 +201,35 @@ def sample_graph(
     val_df, test_df = label_df.random_split(
         [val_perc, 1 - val_perc], random_state=seed, shuffle=True
     )
+    logger.info("Calculated random splits")
 
     total_time = 0.0
     for epoch in range(num_epochs):
-        steps = [("train", train_df), ("test", test_df)]
+        steps = [("train", train_df)]
         if epoch == num_epochs - 1:
             steps.append(("val", val_df))
+            steps.append(("test", test_df))
 
         for step, batch_df in steps:
-            batch_df = batch_df.sample(frac=1.0, random_state=seed)
+            logger.info("Shuffling batch dataframe...")
+            batch_df = batch_df.sample(frac=1.0, random_state=seed).persist()
+            logger.info("Shuffled and persisted batch dataframe...")
 
-            if step == "val":
-                output_sample_path = os.path.join(output_path, "val", "samples")
-            else:
+            if step == "train":
                 output_sample_path = os.path.join(
                     output_path, f"epoch={epoch}", f"{step}", "samples"
                 )
-            os.makedirs(output_sample_path)
+            else:
+                output_sample_path = os.path.join(output_path, step, "samples")
+
+            client = default_client()
+
+            def func():
+                os.makedirs(output_sample_path, exist_ok=True)
+
+            client.run(func)
 
+            logger.info("Creating bulk sampler...")
             sampler = BulkSampler(
                 batch_size=batch_size,
                 output_path=output_sample_path,
@@ -227,6 +242,7 @@ def sample_graph(
                 log_level=logging.INFO,
                 **sampling_kwargs,
             )
+            logger.info("Bulk sampler created and ready for input")
 
             n_workers = len(default_client().scheduler_info()["workers"])
 
@@ -244,13 +260,13 @@ def sample_graph(
             # should always persist the batch dataframe or performance may be suboptimal
             batch_df = batch_df.persist()
 
-            print("created batches")
+            logger.info("created and persisted batches")
 
             start_time = perf_counter()
             sampler.add_batches(batch_df, start_col_name="node", batch_col_name="batch")
             sampler.flush()
             end_time = perf_counter()
-            print("flushed all batches")
+            logger.info("flushed all batches")
             total_time += end_time - start_time
 
     return total_time
@@ -356,23 +372,29 @@ def load_disk_dataset(
     path = Path(dataset_dir) / dataset
     parquet_path = path / "parquet"
 
+    logger = logging.getLogger("__main__")
+
+    logger.info("getting n workers...")
     n_workers = get_n_workers()
+    logger.info(f"there are {n_workers} workers")
 
     with open(os.path.join(path, "meta.json")) as meta_file:
         meta = json.load(meta_file)
 
+    logger.info("assigning offsets...")
     node_offsets, node_offsets_replicated, total_num_nodes = assign_offsets_pyg(
         meta["num_nodes"], replication_factor=replication_factor
     )
+    logger.info("offsets assigned")
 
     edge_index_dict = {}
     for edge_type in meta["num_edges"].keys():
-        print(f"Loading edge index for edge type {edge_type}")
+        logger.info(f"Loading edge index for edge type {edge_type}")
 
         can_edge_type = tuple(edge_type.split("__"))
         edge_index_dict[can_edge_type] = dask_cudf.read_parquet(
             Path(parquet_path) / edge_type / "edge_index.parquet"
-        ).repartition(n_workers * 2)
+        ).repartition(npartitions=n_workers * 2)
 
         edge_index_dict[can_edge_type]["src"] += node_offsets_replicated[
             can_edge_type[0]
@@ -384,6 +406,7 @@ def load_disk_dataset(
         edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type]
 
         if replication_factor > 1:
+            logger.info("processing replications")
             edge_index_dict[can_edge_type] = edge_index_dict[
                 can_edge_type
             ].map_partitions(
@@ -400,6 +423,7 @@ def load_disk_dataset(
                     }
                 ),
             )
+            logger.info("replications processed")
 
         gc.collect()
 
@@ -407,48 +431,63 @@ def load_disk_dataset(
             edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].rename(
                 columns={"src": "dst", "dst": "src"}
             )
+        logger.info("edge index loaded")
 
     # Assign numeric edge type ids based on lexicographic order
     edge_offsets = {}
     edge_count = 0
-    for num_edge_type, can_edge_type in enumerate(sorted(edge_index_dict.keys())):
-        if add_edge_types:
-            edge_index_dict[can_edge_type]["etp"] = cupy.int32(num_edge_type)
-        edge_offsets[can_edge_type] = edge_count
-        edge_count += len(edge_index_dict[can_edge_type])
+    # for num_edge_type, can_edge_type in enumerate(sorted(edge_index_dict.keys())):
+    #    if add_edge_types:
+    #        edge_index_dict[can_edge_type]["etp"] = cupy.int32(num_edge_type)
+    #    edge_offsets[can_edge_type] = edge_count
+    #    edge_count += len(edge_index_dict[can_edge_type])
+
+    if len(edge_index_dict) != 1:
+        raise ValueError("should only be 1 edge index")
+
+    logger.info("setting edge type")
+
+    all_edges_df = list(edge_index_dict.values())[0]
+    if add_edge_types:
+        all_edges_df["etp"] = cupy.int32(0)
 
-    all_edges_df = dask_cudf.concat(list(edge_index_dict.values()))
+    # all_edges_df = dask_cudf.concat(list(edge_index_dict.values()))
 
     del edge_index_dict
     gc.collect()
 
     node_labels = {}
     for node_type, offset in node_offsets_replicated.items():
-        print(f"Loading node labels for node type {node_type} (offset={offset})")
+        logger.info(f"Loading node labels for node type {node_type} (offset={offset})")
         node_label_path = os.path.join(
             os.path.join(parquet_path, node_type), "node_label.parquet"
         )
         if os.path.exists(node_label_path):
             node_labels[node_type] = (
                 dask_cudf.read_parquet(node_label_path)
-                .repartition(n_workers)
+                .repartition(npartitions=n_workers)
                 .drop("label", axis=1)
                 .persist()
             )
+            logger.info(f"Loaded and persisted initial labels")
             node_labels[node_type]["node"] += offset
             node_labels[node_type] = node_labels[node_type].persist()
+            logger.info(f"Set and persisted node offsets")
 
             if replication_factor > 1:
+                logger.info(f"Replicating labels...")
                 node_labels[node_type] = node_labels[node_type].map_partitions(
                     _replicate_df,
                     replication_factor,
                     {"node": meta["num_nodes"][node_type]},
                     meta=cudf.DataFrame({"node": cudf.Series(dtype="int64")}),
                 )
+                logger.info(f"Replicated labels (will likely evaluate later)")
 
             gc.collect()
 
     node_labels_df = dask_cudf.concat(list(node_labels.values())).reset_index(drop=True)
+    logger.info("Dataset successfully loaded")
 
     del node_labels
     gc.collect()
@@ -459,6 +498,7 @@ def load_disk_dataset(
         node_offsets_replicated,
         edge_offsets,
         total_num_nodes,
+        sum(meta["num_edges"].values()) * replication_factor,
     )
 
 
@@ -540,6 +580,7 @@ def benchmark_cugraph_bulk_sampling(
             node_offsets,
             edge_offsets,
             total_num_nodes,
+            num_input_edges,
         ) = load_disk_dataset(
             dataset,
             dataset_dir=dataset_dir,
@@ -548,7 +589,6 @@ def benchmark_cugraph_bulk_sampling(
             add_edge_types=add_edge_types,
         )
 
-    num_input_edges = len(dask_edgelist_df)
     logger.info(f"Number of input edges = {num_input_edges:,}")
 
     G = construct_graph(dask_edgelist_df)
@@ -562,7 +602,13 @@ def benchmark_cugraph_bulk_sampling(
         output_path,
         f"{dataset}[{replication_factor}]_b{batch_size}_f{fanout}",
     )
-    os.makedirs(output_subdir)
+
+    client = default_client()
+
+    def func():
+        os.makedirs(output_subdir, exist_ok=True)
+
+    client.run(func)
 
     if sampling_target_framework == "cugraph_dgl_csr":
         sampling_kwargs = {
@@ -574,8 +620,8 @@ def benchmark_cugraph_bulk_sampling(
             "use_legacy_names": False,
             "include_hop_column": False,
         }
-    else:
-        # FIXME: Update these arguments when CSC mode is fixed in cuGraph-PyG (release 24.02)
+    elif sampling_target_framework == "cugraph_pyg":
+        # FIXME: Update these arguments when CSC mode is fixed in cuGraph-PyG (release 24.04)
         sampling_kwargs = {
             "deduplicate_sources": True,
             "prior_sources_behavior": "exclude",
@@ -585,8 +631,10 @@ def benchmark_cugraph_bulk_sampling(
             "use_legacy_names": False,
             "include_hop_column": True,
         }
+    else:
+        raise ValueError("Only cugraph_dgl_csr or cugraph_pyg are valid frameworks")
 
-    batches_per_partition = 600_000 // batch_size
+    batches_per_partition = 256
     execution_time, allocation_counts = sample_graph(
         G=G,
         label_df=dask_label_df,
@@ -761,9 +809,9 @@ def get_args():
     logger.setLevel(logging.INFO)
 
     args = get_args()
-    if args.sampling_target_framework not in ["cugraph_dgl_csr", None]:
+    if args.sampling_target_framework not in ["cugraph_dgl_csr", "cugraph_pyg"]:
         raise ValueError(
-            "sampling_target_framework must be one of cugraph_dgl_csr or None",
+            "sampling_target_framework must be one of cugraph_dgl_csr or cugraph_pyg",
             "Other frameworks are not supported at this time.",
         )
 
@@ -775,12 +823,30 @@ def get_args():
     seeds_per_call_opts = [int(s) for s in args.seeds_per_call_opts.split(",")]
     dask_worker_devices = [int(d) for d in args.dask_worker_devices.split(",")]
 
-    logger.info("starting dask client")
-    client, cluster = start_dask_client()
+    import time
+
+    time_dask_start = time.localtime()
+
+    logger.info(f"{time.asctime(time_dask_start)}: starting dask client")
+    from dask_cuda.initialize import initialize
+    from dask.distributed import Client
+    from cugraph.dask.comms import comms as Comms
+    import os, time
+
+    client = Client(scheduler_file=os.environ["SCHEDULER_FILE"], timeout=360)
+    time.sleep(30)
+    cluster = Comms.initialize(p2p=True)
+    # client, cluster = start_dask_client()
+    time_dask_end = time.localtime()
+    logger.info(f"{time.asctime(time_dask_end)}: dask client started")
+
+    logger.info("enabling spilling")
     enable_spilling()
-    stats_ls = []
     client.run(enable_spilling)
-    logger.info("dask client started")
+    logger.info("enabled spilling")
+
+    stats_ls = []
+
     for dataset in datasets:
         m = re.match(r"(\w+)\[([0-9]+)\]", dataset)
         if m:
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py b/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py
index a50e40f6d55..e3151e37a25 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py
@@ -34,6 +34,7 @@ def __init__(
         train_split=0.8,
         val_split=0.5,
         load_edge_index=True,
+        backend="torch",
     ):
         self.__replication_factor = replication_factor
         self.__disk_x = None
@@ -43,6 +44,7 @@ def __init__(
         self.__train_split = train_split
         self.__val_split = val_split
         self.__load_edge_index = load_edge_index
+        self.__backend = backend
 
     def download(self):
         import logging
@@ -152,6 +154,27 @@ def download(self):
             )
             ldf.to_parquet(node_label_file_path)
 
+        # WholeGraph
+        wg_bin_file_path = os.path.join(dataset_path, "wgb", "paper")
+        if self.__replication_factor == 1:
+            wg_bin_rep_path = os.path.join(wg_bin_file_path, "node_feat.d")
+        else:
+            wg_bin_rep_path = os.path.join(
+                wg_bin_file_path, f"node_feat_{self.__replication_factor}x.d"
+            )
+
+        if not os.path.exists(wg_bin_rep_path):
+            os.makedirs(wg_bin_rep_path)
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+            node_feat = dataset[0][0]["node_feat"]
+            for k in range(self.__replication_factor):
+                node_feat.tofile(os.path.join(wg_bin_rep_path, f"{k:04d}.bin"))
+
     @property
     def edge_index_dict(
         self,
@@ -224,45 +247,87 @@ def edge_index_dict(
 
     @property
     def x_dict(self) -> Dict[str, torch.Tensor]:
+        if self.__disk_x is None:
+            if self.__backend == "wholegraph":
+                self.__load_x_wg()
+            else:
+                self.__load_x_torch()
+
+        return self.__disk_x
+
+    def __load_x_torch(self) -> None:
         node_type_path = os.path.join(
             self.__dataset_dir, "ogbn_papers100M", "npy", "paper"
         )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.npy")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.npy"
+            )
 
-        if self.__disk_x is None:
-            if self.__replication_factor == 1:
-                full_path = os.path.join(node_type_path, "node_feat.npy")
-            else:
-                full_path = os.path.join(
-                    node_type_path, f"node_feat_{self.__replication_factor}x.npy"
-                )
+        self.__disk_x = {"paper": torch.as_tensor(np.load(full_path, mmap_mode="r"))}
 
-            self.__disk_x = {"paper": np.load(full_path, mmap_mode="r")}
+    def __load_x_wg(self) -> None:
+        import logging
 
-        return self.__disk_x
+        logger = logging.getLogger("OGBNPapers100MDataset")
+        logger.info("Loading x into WG embedding...")
+
+        import pylibwholegraph.torch as wgth
+
+        node_type_path = os.path.join(
+            self.__dataset_dir, "ogbn_papers100M", "wgb", "paper"
+        )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.d")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.d"
+            )
+
+        file_list = [os.path.join(full_path, f) for f in os.listdir(full_path)]
+
+        x = wgth.create_embedding_from_filelist(
+            wgth.get_global_communicator(),
+            "distributed",  # TODO support other options
+            "cpu",  # TODO support GPU
+            file_list,
+            torch.float32,
+            128,
+        )
+        from pylibwholegraph.torch.initialize import get_global_communicator
+
+        wm_comm = get_global_communicator()
+        wm_comm.barrier()
+
+        logger.info("created x wg embedding")
+
+        self.__disk_x = {"paper": x}
 
     @property
     def y_dict(self) -> Dict[str, torch.Tensor]:
         if self.__y is None:
-            self.__get_labels()
+            self.__get_y()
 
         return self.__y
 
     @property
     def train_dict(self) -> Dict[str, torch.Tensor]:
         if self.__train is None:
-            self.__get_labels()
+            self.__get_split()
         return self.__train
 
     @property
     def test_dict(self) -> Dict[str, torch.Tensor]:
         if self.__test is None:
-            self.__get_labels()
+            self.__get_split()
         return self.__test
 
     @property
     def val_dict(self) -> Dict[str, torch.Tensor]:
         if self.__val is None:
-            self.__get_labels()
+            self.__get_split()
         return self.__val
 
     @property
@@ -271,7 +336,7 @@ def num_input_features(self) -> int:
 
     @property
     def num_labels(self) -> int:
-        return int(self.y_dict["paper"].max()) + 1
+        return 172
 
     def num_nodes(self, node_type: str) -> int:
         if node_type != "paper":
@@ -285,46 +350,49 @@ def num_edges(self, edge_type: Tuple[str, str, str]) -> int:
 
         return 1_615_685_872 * self.__replication_factor
 
-    def __get_labels(self):
+    def __get_y(self):
         label_path = os.path.join(
             self.__dataset_dir,
             "ogbn_papers100M",
-            "parquet",
+            "wgb",
             "paper",
-            "node_label.parquet",
+            "node_label.d",
+            "0.bin",
         )
 
-        node_label = pandas.read_parquet(label_path)
-
-        if self.__replication_factor > 1:
-            orig_num_nodes = self.num_nodes("paper") // self.__replication_factor
-            dfr = pandas.DataFrame(
-                {
-                    "node": pandas.concat(
-                        [
-                            node_label.node + (r * orig_num_nodes)
-                            for r in range(1, self.__replication_factor)
-                        ]
-                    ),
-                    "label": pandas.concat(
-                        [node_label.label for r in range(1, self.__replication_factor)]
-                    ),
-                }
+        if self.__backend == "wholegraph":
+            import pylibwholegraph.torch as wgth
+
+            node_label = wgth.create_embedding_from_filelist(
+                wgth.get_global_communicator(),
+                "distributed",  # TODO support other options
+                "cpu",  # TODO support GPU
+                [label_path] * self.__replication_factor,
+                torch.int16,
+                1,
+            )
+
+        else:
+            node_label_1x = torch.as_tensor(
+                np.fromfile(label_path, dtype="int16"), device="cpu"
             )
-            node_label = pandas.concat([node_label, dfr]).reset_index(drop=True)
 
+            if self.__replication_factor > 1:
+                node_label = torch.concatenate(
+                    [node_label_1x] * self.__replication_factor
+                )
+            else:
+                node_label = node_label_1x
+
+        self.__y = {"paper": node_label}
+
+    def __get_split(self):
         num_nodes = self.num_nodes("paper")
-        node_label_tensor = torch.full(
-            (num_nodes,), -1, dtype=torch.float32, device="cpu"
-        )
-        node_label_tensor[
-            torch.as_tensor(node_label.node.values, device="cpu")
-        ] = torch.as_tensor(node_label.label.values, device="cpu")
 
-        self.__y = {"paper": node_label_tensor.contiguous()}
+        node = self.y_dict["paper"][self.y_dict["paper"] > 0]
 
         train_ix, test_val_ix = train_test_split(
-            torch.as_tensor(node_label.node.values),
+            node,
             train_size=self.__train_split,
             random_state=num_nodes,
         )
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/__init__.py
similarity index 51%
rename from python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py
rename to benchmarks/cugraph/standalone/bulk_sampling/models/dgl/__init__.py
index 46c6b54075b..610a7648801 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/__init__.py
@@ -10,22 +10,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import cupy as cp
 
-from nx_cugraph.algorithms.cluster import _triangles
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import networkx_algorithm
 
-__all__ = [
-    "is_bipartite",
-]
-
-
-@networkx_algorithm(version_added="24.02", _plc="triangle_count")
-def is_bipartite(G):
-    G = _to_graph(G)
-    # Counting triangles may not be the fastest way to do this, but it is simple.
-    node_ids, triangles, is_single_node = _triangles(
-        G, None, symmetrize="union" if G.is_directed() else None
-    )
-    return int(cp.count_nonzero(triangles)) == 0
+from .models_dgl import GraphSAGE
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py
new file mode 100644
index 00000000000..2cfdda2d2e7
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+
+
+class GraphSAGE(torch.nn.Module):
+    """
+    GraphSAGE model for DGL: `num_layers` mean-aggregating SAGEConv layers,
+    taken from `dgl.nn` when model_backend == "dgl" and from `cugraph_dgl.nn`
+    otherwise; ReLU + dropout (p=0.5) follow every layer except the last.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        num_layers,
+        model_backend="dgl",
+    ):
+        if model_backend == "dgl":
+            from dgl.nn import SAGEConv
+        else:
+            from cugraph_dgl.nn import SAGEConv
+
+        super(GraphSAGE, self).__init__()
+        self.convs = torch.nn.ModuleList()
+        for _ in range(num_layers - 1):
+            self.convs.append(
+                SAGEConv(in_channels, hidden_channels, aggregator_type="mean")
+            )
+            in_channels = hidden_channels
+        # After the loop in_channels == hidden_channels, unless num_layers == 1,
+        # in which case the only layer must map the raw input width to the output.
+        self.convs.append(SAGEConv(in_channels, out_channels, aggregator_type="mean"))
+
+    def forward(self, blocks, x):
+        """
+        Runs the forward pass: blocks[i] is passed to the i-th conv layer along
+        with the running feature tensor x; returns the last layer's raw output.
+        """
+        # Dropout follows self.training; otherwise it would stay active in eval().
+        for i, conv in enumerate(self.convs):
+            x = conv(blocks[i], x)
+            if i != len(self.convs) - 1:
+                x = F.relu(x)
+                x = F.dropout(x, p=0.5, training=self.training)
+        return x
+
+
+def create_model(feat_size, num_classes, num_layers, model_backend="dgl"):
+    """Builds a 64-hidden-channel GraphSAGE on "cuda", set to training mode."""
+    hidden_size = 64
+    net = GraphSAGE(
+        feat_size, hidden_size, num_classes, num_layers, model_backend=model_backend
+    ).to("cuda")
+    return net.train()
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py
index 1de791bf588..7ee400b004f 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py
@@ -57,7 +57,7 @@ def forward(self, x, edge, num_sampled_nodes, num_sampled_edges):
 
         for i, conv in enumerate(self.convs):
             if i > 0:
-                new_num_edges = edge[1][-2]
+                new_num_edges = int(edge[1][-2])
                 edge[0] = edge[0].narrow(
                     dim=0,
                     start=0,
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh b/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh
index 27ae0dc7788..8136018c877 100755
--- a/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh
+++ b/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh
@@ -12,12 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#SBATCH -A datascience_rapids_cugraphgnn
-#SBATCH -p luna
-#SBATCH -J datascience_rapids_cugraphgnn-papers:bulkSamplingPyG
-#SBATCH -N 1
-#SBATCH -t 00:25:00
-
 CONTAINER_IMAGE=${CONTAINER_IMAGE:="please_specify_container"}
 SCRIPTS_DIR=$(pwd)
 LOGS_DIR=${LOGS_DIR:=$(pwd)"/logs"}
@@ -31,10 +25,11 @@ mkdir -p $DATASETS_DIR
 BATCH_SIZE=512
 FANOUT="10_10_10"
 NUM_EPOCHS=1
-REPLICATION_FACTOR=1
+REPLICATION_FACTOR=2
+JOB_ID=$RANDOM
 
-# options: PyG or cuGraphPyG
-FRAMEWORK="cuGraphPyG"
+# options: PyG, cuGraphPyG, or cuGraphDGL
+FRAMEWORK="cuGraphDGL"
 GPUS_PER_NODE=8
 
 nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
@@ -52,6 +47,7 @@ echo Num GPUs Per Node: $gpus_per_node
 
 set -e
 
+
 # First run without cuGraph to get data
 
 if [[ "$FRAMEWORK" == "cuGraphPyG" ]]; then
@@ -59,25 +55,10 @@ if [[ "$FRAMEWORK" == "cuGraphPyG" ]]; then
     srun \
         --container-image $CONTAINER_IMAGE \
         --container-mounts=${LOGS_DIR}":/logs",${SAMPLES_DIR}":/samples",${SCRIPTS_DIR}":/scripts",${DATASETS_DIR}":/datasets" \
-        bash /scripts/run_sampling.sh $BATCH_SIZE $FANOUT $REPLICATION_FACTOR "/scripts" $NUM_EPOCHS
+        bash /scripts/train.sh $BATCH_SIZE $FANOUT $REPLICATION_FACTOR "/scripts" $NUM_EPOCHS "cugraph_pyg" $nnodes $head_node_ip $JOB_ID
+elif [[ "$FRAMEWORK" == "cuGraphDGL" ]]; then
+    srun \
+        --container-image $CONTAINER_IMAGE \
+        --container-mounts=${LOGS_DIR}":/logs",${SAMPLES_DIR}":/samples",${SCRIPTS_DIR}":/scripts",${DATASETS_DIR}":/datasets" \
+        bash /scripts/train.sh $BATCH_SIZE $FANOUT $REPLICATION_FACTOR "/scripts" $NUM_EPOCHS "cugraph_dgl_csr" $nnodes $head_node_ip $JOB_ID
 fi
-
-# Train
-srun \
-    --container-image $CONTAINER_IMAGE \
-    --container-mounts=${LOGS_DIR}":/logs",${SAMPLES_DIR}":/samples",${SCRIPTS_DIR}":/scripts",${DATASETS_DIR}":/datasets" \
-    torchrun \
-        --nnodes $nnodes \
-        --nproc-per-node $gpus_per_node \
-        --rdzv-id $RANDOM \
-        --rdzv-backend c10d \
-        --rdzv-endpoint $head_node_ip:29500 \
-        /scripts/bench_cugraph_training.py \
-            --output_file "/logs/output.txt" \
-            --framework $FRAMEWORK \
-            --dataset_dir "/datasets" \
-            --sample_dir "/samples" \
-            --batch_size $BATCH_SIZE \
-            --fanout $FANOUT \
-            --replication_factor $REPLICATION_FACTOR \
-            --num_epochs $NUM_EPOCHS
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh b/benchmarks/cugraph/standalone/bulk_sampling/train.sh
similarity index 66%
rename from benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh
rename to benchmarks/cugraph/standalone/bulk_sampling/train.sh
index 1b3085dcc9a..a3b85e281f1 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh
+++ b/benchmarks/cugraph/standalone/bulk_sampling/train.sh
@@ -21,6 +21,10 @@ FANOUT=$2
 REPLICATION_FACTOR=$3
 SCRIPTS_DIR=$4
 NUM_EPOCHS=$5
+SAMPLING_FRAMEWORK=$6
+N_NODES=$7
+HEAD_NODE_IP=$8
+JOB_ID=$9
 
 SAMPLES_DIR=/samples
 DATASET_DIR=/datasets
@@ -29,12 +33,19 @@ LOGS_DIR=/logs
 MG_UTILS_DIR=${SCRIPTS_DIR}/mg_utils
 SCHEDULER_FILE=${MG_UTILS_DIR}/dask_scheduler.json
 
-export WORKER_RMM_POOL_SIZE=28G
-export UCX_MAX_RNDV_RAILS=1
+echo $SAMPLES_DIR
+ls $SAMPLES_DIR
+
+export WORKER_RMM_POOL_SIZE=75G
+#export UCX_MAX_RNDV_RAILS=1
 export RAPIDS_NO_INITIALIZE=1
 export CUDF_SPILL=1
-export LIBCUDF_CUFILE_POLICY="OFF"
+export LIBCUDF_CUFILE_POLICY="KVIKIO"
+export KVIKIO_NTHREADS=64
 export GPUS_PER_NODE=8
+#export NCCL_CUMEM_ENABLE=0
+#export NCCL_DEBUG="TRACE"
+export NCCL_DEBUG_FILE=/logs/nccl_debug.%h.%p
 
 export SCHEDULER_FILE=$SCHEDULER_FILE
 export LOGS_DIR=$LOGS_DIR
@@ -59,8 +70,9 @@ else
 fi
 
 echo "properly waiting for workers to connect"
-NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))")
-handleTimeout 120 python ${MG_UTILS_DIR}/wait_for_workers.py \
+export NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))")
+SEEDS_PER_CALL=$(python -c "import os; print(int(os.environ['NUM_GPUS'])*65536)")
+handleTimeout 630 python ${MG_UTILS_DIR}/wait_for_workers.py \
                     --num-expected-workers ${NUM_GPUS} \
                     --scheduler-file-path ${SCHEDULER_FILE}
 
@@ -76,14 +88,15 @@ if [[ $SLURM_NODEID == 0 ]]; then
         --datasets "ogbn_papers100M["$REPLICATION_FACTOR"]" \
         --fanouts $FANOUT \
         --batch_sizes $BATCH_SIZE \
-        --seeds_per_call_opts "524288" \
+        --seeds_per_call_opts $SEEDS_PER_CALL \
         --num_epochs $NUM_EPOCHS \
-        --random_seed 42
+        --random_seed 42 \
+        --sampling_target_framework $SAMPLING_FRAMEWORK
 
-    echo "DONE" > ${SAMPLES_DIR}/status.txt
+    echo "DONE" > ${LOGS_DIR}/status.txt
 fi
 
-while [ ! -f "${SAMPLES_DIR}"/status.txt ]
+while [ ! -f "${LOGS_DIR}"/status.txt ]
 do
     sleep 1
 done
@@ -106,6 +119,25 @@ if [[ ${#python_processes[@]} -gt 1 || $dask_processes ]]; then
 fi
 sleep 2
 
+torchrun \
+    --nnodes $N_NODES \
+    --nproc-per-node $GPUS_PER_NODE \
+    --rdzv-id $JOB_ID \
+    --rdzv-backend c10d \
+    --rdzv-endpoint $HEAD_NODE_IP:29500 \
+    /scripts/bench_cugraph_training.py \
+        --output_file "/logs/output.txt" \
+        --framework $SAMPLING_FRAMEWORK \
+        --dataset_dir "/datasets" \
+        --sample_dir "/samples" \
+        --batch_size $BATCH_SIZE \
+        --fanout $FANOUT \
+        --replication_factor $REPLICATION_FACTOR \
+        --num_epochs $NUM_EPOCHS \
+        --use_wholegraph \
+        --skip_download
+
+
 if [[ $SLURM_NODEID == 0 ]]; then
-    rm ${SAMPLES_DIR}/status.txt
+    rm ${LOGS_DIR}/status.txt
 fi
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/__init__.py
new file mode 100644
index 00000000000..03d2a51e538
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .trainers_dgl import DGLTrainer
+from .trainers_cugraph_dgl import DGLCuGraphTrainer
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py
new file mode 100644
index 00000000000..37745e645fd
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py
@@ -0,0 +1,315 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import re
+
+from .trainers_dgl import DGLTrainer
+from models.dgl import GraphSAGE
+from datasets import Dataset
+
+import torch
+import numpy as np
+import warnings
+
+from torch.nn.parallel import DistributedDataParallel as ddp
+from cugraph_dgl.dataloading import HomogenousBulkSamplerDataset
+from cugraph.gnn import FeatureStore
+
+from typing import List
+
+
+def get_dataloader(
+    input_file_paths: List[str],
+    total_num_nodes: int,
+    sparse_format: str,
+    return_type: str,
+) -> torch.utils.data.DataLoader:
+    """
+    Builds a dataloader over the bulk samples stored in the given files.
+
+    Parameters
+    ----------
+    input_file_paths: List[str]
+        Parquet files containing the samples to read.
+    total_num_nodes: int
+        Total number of nodes in the graph.
+    sparse_format: str
+        The sparse format of the samples (e.g. "coo" or "csc").
+    return_type: str
+        The type of object the dataset yields (e.g. dgl.Block).
+
+    Returns
+    -------
+    torch.utils.data.DataLoader
+        Yields dataset items as-is. An empty list is returned instead
+        when input_file_paths is empty.
+    """
+
+    print("Creating dataloader", flush=True)
+    began = time.time()
+    if len(input_file_paths) == 0:
+        # Nothing to read: hand back an empty iterable.
+        return []
+
+    samples = HomogenousBulkSamplerDataset(
+        total_num_nodes,
+        edge_dir="in",
+        sparse_format=sparse_format,
+        return_type=return_type,
+    )
+    samples.set_input_files(input_file_paths=input_file_paths)
+    # Identity collate with batch_size=None: every dataset item is yielded as-is.
+    loader_kwargs = dict(collate_fn=lambda x: x, shuffle=False, num_workers=0)
+    loader = torch.utils.data.DataLoader(samples, batch_size=None, **loader_kwargs)
+
+    elapsed = time.time() - began
+    print(f"Time to create dataloader = {elapsed:.2f} seconds", flush=True)
+    return loader
+
+
+class DGLCuGraphTrainer(DGLTrainer):
+    """
+    Trainer implementation for cuGraph-DGL that supports
+    WholeGraph as a feature store.
+    """
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        model: str = "GraphSAGE",
+        device: int = 0,
+        rank: int = 0,
+        world_size: int = 1,
+        gpus_per_node: int = 1,
+        num_epochs: int = 1,
+        sample_dir: str = ".",
+        backend: str = "torch",
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        dataset: Dataset
+            The dataset to train on.
+        model: str
+            The model to use for training.
+            Currently only "GraphSAGE" is supported.
+        device: int, default=0
+            The CUDA device to use.
+        rank: int, default=0
+            The global rank of the worker this trainer is assigned to.
+        world_size: int, default=1
+            The number of workers in the world.
+        gpus_per_node: int, default=1
+            The number of parts the sample files are split into.
+        num_epochs: int, default=1
+            The number of training epochs to run.
+        sample_dir: str, default="."
+            The directory where samples generated by the bulk sampler
+            are stored.
+        backend: str, default="torch"
+            The feature store backend to be used by the cuGraph Feature Store.
+            Defaults to "torch".  Options are "torch" and "wholegraph"
+        kwargs
+            Loader keyword arguments; must include "num_neighbors".
+        """
+        self.__data = None
+        self.__device = device
+        self.__rank = rank
+        self.__world_size = world_size
+        self.__gpus_per_node = gpus_per_node
+        self.__num_epochs = num_epochs
+        self.__dataset = dataset
+        self.__sample_dir = sample_dir
+        self.__loader_kwargs = kwargs
+        self.__model = self.get_model(model)
+        self.__optimizer = None
+        self.__backend = backend
+
+    @property
+    def rank(self):
+        return self.__rank
+
+    @property
+    def model(self):
+        return self.__model
+
+    @property
+    def dataset(self):
+        return self.__dataset
+
+    @property
+    def optimizer(self):
+        if self.__optimizer is None:
+            self.__optimizer = torch.optim.Adam(
+                self.model.parameters(), lr=0.01, weight_decay=0.0005
+            )
+        return self.__optimizer
+
+    @property
+    def num_epochs(self) -> int:
+        return self.__num_epochs
+
+    def get_loader(self, epoch: int = 0, stage="train") -> tuple:
+        """Returns (dataloader, num_batches) for the given epoch and stage."""
+        # TODO support online sampling
+        if stage == "train":
+            path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples")
+        elif stage in ["test", "val"]:
+            path = os.path.join(self.__sample_dir, stage, "samples")
+        else:
+            raise ValueError(f"Invalid stage {stage}")
+
+        input_file_paths, num_batches = self.get_input_files(
+            path, epoch=epoch, stage=stage
+        )
+
+        dataloader = get_dataloader(
+            input_file_paths=input_file_paths.tolist(),
+            total_num_nodes=None,
+            sparse_format="csc",
+            return_type="cugraph_dgl.nn.SparseGraph",
+        )
+        return dataloader, num_batches
+
+    @property
+    def data(self):
+        """Feature store holding x and y per node type; built on first access."""
+        import logging
+
+        logger = logging.getLogger("DGLCuGraphTrainer")
+        logger.info("getting data")
+
+        if self.__data is None:
+            if self.__backend == "wholegraph":
+                logger.info("using wholegraph backend")
+                fs = FeatureStore(
+                    backend="wholegraph",
+                    wg_type="chunked",
+                    wg_location="cpu",
+                )
+                from pylibwholegraph.torch.initialize import get_global_communicator
+                wm_comm = get_global_communicator()
+                wm_comm.barrier()
+            else:
+                fs = FeatureStore(backend=self.__backend)
+            num_nodes_dict = {}
+
+            for node_type, x in self.__dataset.x_dict.items():
+                logger.debug(f"getting x for {node_type}")
+                fs.add_data(x, node_type, "x")
+                num_nodes_dict[node_type] = self.__dataset.num_nodes(node_type)
+                if self.__backend == "wholegraph":
+                    wm_comm.barrier()
+
+            for node_type, y in self.__dataset.y_dict.items():
+                logger.debug(f"getting y for {node_type}")
+                if self.__backend == "wholegraph":
+                    logger.info("using wholegraph backend")
+                    fs.add_data(y, node_type, "y")
+                    wm_comm.barrier()
+                else:
+                    y = y.cuda()
+                    y = y.reshape((y.shape[0], 1))
+                    fs.add_data(y, node_type, "y")
+
+            """
+            for node_type, train in self.__dataset.train_dict.items():
+                logger.debug(f"getting train for {node_type}")
+                train = train.reshape((train.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    train = train.cuda()
+                fs.add_data(train, node_type, "train")
+
+            for node_type, test in self.__dataset.test_dict.items():
+                logger.debug(f"getting test for {node_type}")
+                test = test.reshape((test.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    test = test.cuda()
+                fs.add_data(test, node_type, "test")
+
+            for node_type, val in self.__dataset.val_dict.items():
+                logger.debug(f"getting val for {node_type}")
+                val = val.reshape((val.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    val = val.cuda()
+                fs.add_data(val, node_type, "val")
+            """
+
+            # # TODO support online sampling if the edge index is provided
+            # num_edges_dict = self.__dataset.edge_index_dict
+            # if not isinstance(list(num_edges_dict.values())[0], int):
+            #     num_edges_dict = {k: len(v) for k, v in num_edges_dict}
+
+            if self.__backend == "wholegraph":
+                wm_comm.barrier()
+
+            self.__data = fs
+        return self.__data
+
+    def get_model(self, name="GraphSAGE"):
+        if name != "GraphSAGE":
+            raise ValueError("only GraphSAGE is currently supported")
+
+        num_input_features = self.__dataset.num_input_features
+        num_output_features = self.__dataset.num_labels
+        num_layers = len(self.__loader_kwargs["num_neighbors"])
+
+        with torch.cuda.device(self.__device):
+            model = (
+                GraphSAGE(
+                    in_channels=num_input_features,
+                    hidden_channels=64,
+                    out_channels=num_output_features,
+                    num_layers=num_layers,
+                    model_backend="cugraph_dgl",
+                )
+                .to(torch.float32)
+                .to(self.__device)
+            )
+            # TODO: Fix for distributed models
+            if torch.distributed.is_initialized():
+                model = ddp(model, device_ids=[self.__device])
+            else:
+                warnings.warn("Distributed training is not available")
+            print("done creating model")
+
+        return model
+
+    def get_input_files(self, path, epoch=0, stage="train"):
+        """
+        Shuffles the files under path (seeded by epoch), splits them into
+        gpus_per_node parts and returns (this device's part, min batch count).
+        """
+        file_list = np.array([f.path for f in os.scandir(path)])
+        file_list.sort()
+        np.random.seed(epoch)
+        np.random.shuffle(file_list)
+
+        splits = np.array_split(file_list, self.__gpus_per_node)
+
+        # The dot before "parquet" is escaped so it only matches a literal ".".
+        ex = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")
+
+        def batch_span(fname):
+            found = ex.match(fname.split("/")[-1])
+            return int(found[2]) - int(found[1])
+
+        num_batches = min(sum(batch_span(f) for f in s) for s in splits)
+        if num_batches == 0:
+            raise ValueError(
+                f"Too few batches for training with world size {self.__world_size}"
+            )
+
+        return splits[self.__device], num_batches
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py
new file mode 100644
index 00000000000..fad986257b2
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py
@@ -0,0 +1,361 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import logging
+import torch
+import torch.distributed as td
+import torch.nn.functional as F
+from torchmetrics import Accuracy
+from trainers import Trainer
+import time
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from cugraph.gnn import FeatureStore
+
+
+def get_features(input_nodes, output_nodes, feature_store, key="paper"):
+    """
+    Fetches features "x" for input_nodes and labels "y" (flattened to 1-D)
+    for output_nodes of node type key; dicts are first indexed by key.
+    """
+    src = input_nodes[key] if isinstance(input_nodes, dict) else input_nodes
+    dst = output_nodes[key] if isinstance(output_nodes, dict) else output_nodes
+
+    # TODO: revisit. Indices go to the CPU for the "torch" backend on the
+    # assumption that CPU features with a GPU index are not supported yet.
+    if feature_store.backend == "torch":
+        src, dst = src.to("cpu"), dst.to("cpu")
+
+    fetch = feature_store.get_data
+    x = fetch(indices=src, type_name=key, feat_name="x")
+    labels = fetch(indices=dst, type_name=key, feat_name="y")
+    labels = labels.reshape((labels.shape[0],))
+    return x, labels
+
+
+def log_batch(
+    logger: logging.Logger,
+    iter_i: int,
+    num_batches: int,
+    time_forward: int,
+    time_backward: int,
+    time_start: int,
+    loader_time_iter: int,
+    epoch: int,
+    rank: int,
+):
+    """
+    Logs the trainer's timing averages for the current iteration at INFO
+    level, one message per metric.
+
+    Parameters
+    ----------
+    logger: logging.Logger
+        The logger that receives the messages.
+    iter_i: int
+        The current training iteration.
+    num_batches: int
+        The number of batches processed so far; divisor of the averages.
+    time_forward: int
+        The total time of the model forward pass so far.
+    time_backward: int
+        The total time of the model backward pass so far.
+    time_start: int
+        The time.perf_counter() value at which training was started.
+    loader_time_iter: int
+        The time taken by the loader in the current iteration.
+    epoch: int
+        The current training epoch.
+    rank: int
+        The global rank of this worker.
+    """
+
+    avg_forward = time_forward / num_batches
+    avg_backward = time_backward / num_batches
+    avg_total = (time.perf_counter() - time_start) / num_batches
+    for message in (
+        f"epoch {epoch}, iteration {iter_i}, rank {rank}",
+        f"time forward: {avg_forward}",
+        f"time backward: {avg_backward}",
+        f"loader time: {loader_time_iter}",
+        f"total time: {avg_total}",
+    ):
+        logger.info(message)
+
+
+def train_epoch(
+    model,
+    optimizer,
+    loader,
+    feature_store,
+    epoch,
+    num_classes,
+    time_d,
+    logger,
+    rank,
+    max_num_batches,
+):
+    """
+    Train the model for one epoch.
+        model: The model to train.
+        optimizer: The optimizer to use.
+        loader: Iterable of (input_nodes, output_nodes, blocks) batches.
+        feature_store: Store queried through get_features for x and y.
+        epoch: The epoch number (only used for logging).
+        num_classes: The number of classes.
+        time_d: Dict of timers; this epoch's times are added to it in place.
+        logger: The logger to use.
+        rank: Global rank (only used for logging).
+        max_num_batches: If not None, stop after the iteration whose index
+            reaches this value (to avoid hang due to asymmetry).
+    Returns the number of batches processed in this epoch.
+    """
+    model = model.train()
+    # Accumulate this epoch's timings from zero; they are folded into time_d
+    # once at the end, so earlier epochs are never counted twice.
+    time_feature_indexing = 0.0
+    time_feature_transfer = 0.0
+    time_forward = 0.0
+    time_backward = 0.0
+    time_loader = 0.0
+
+    time_start = time.perf_counter()
+    end_time_backward = time.perf_counter()
+
+    num_batches = 0
+
+    for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader):
+        loader_time_iter = time.perf_counter() - end_time_backward
+        time_loader += loader_time_iter
+        feature_indexing_time_start = time.perf_counter()
+        x, y_true = get_features(input_nodes, output_nodes, feature_store=feature_store)
+        time_feature_indexing += time.perf_counter() - feature_indexing_time_start
+        feature_transfer_time_start = time.perf_counter()
+        x = x.to("cuda")
+        y_true = y_true.to("cuda")
+        time_feature_transfer += time.perf_counter() - feature_transfer_time_start
+        num_batches += 1
+
+        start_time_forward = time.perf_counter()
+        y_pred = model(blocks, x)
+        end_time_forward = time.perf_counter()
+        time_forward += end_time_forward - start_time_forward
+
+        if y_pred.shape[0] > len(y_true):
+            raise ValueError(f"illegal shape: {y_pred.shape}; {y_true.shape}")
+
+        # Keep only the labels of the rows the model produced, one-hot encoded.
+        y_true = y_true[: y_pred.shape[0]]
+        y_true = F.one_hot(
+            y_true.to(torch.int64),
+            num_classes=num_classes,
+        ).to(torch.float32)
+
+        if y_true.shape != y_pred.shape:
+            raise ValueError(
+                f"y_true shape was {y_true.shape} "
+                f"but y_pred shape was {y_pred.shape} "
+                f"in iteration {iter_i} "
+                f"on rank {y_pred.device.index}"
+            )
+
+        start_time_backward = time.perf_counter()
+        loss = F.cross_entropy(y_pred, y_true)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        end_time_backward = time.perf_counter()
+        time_backward += end_time_backward - start_time_backward
+
+        if iter_i % 50 == 0:
+            log_batch(
+                logger=logger,
+                iter_i=iter_i,
+                num_batches=num_batches,
+                time_forward=time_forward,
+                time_backward=time_backward,
+                time_start=time_start,
+                loader_time_iter=loader_time_iter,
+                epoch=epoch,
+                rank=rank,
+            )
+
+        if max_num_batches is not None and iter_i >= max_num_batches:
+            break
+
+    # Fold this epoch's totals into the caller's running timers.
+    time_d["time_loader"] += time_loader
+    time_d["time_feature_indexing"] += time_feature_indexing
+    time_d["time_feature_transfer"] += time_feature_transfer
+    time_d["time_forward"] += time_forward
+    time_d["time_backward"] += time_backward
+
+    return num_batches
+
+
+def get_accuracy(
+    model: torch.nn.Module,
+    loader: torch.utils.data.DataLoader,
+    feature_store: FeatureStore,
+    num_classes: int,
+    max_num_batches: int,
+) -> float:
+    """
+    Computes the mean per-batch multiclass accuracy of the model over the
+    loader, averaged over all ranks via all_reduce.
+
+    Parameters
+    ----------
+    model: torch.nn.Module
+        The model being evaluated
+    loader: torch.utils.data.DataLoader
+        The loader over evaluation samples
+    feature_store: cugraph.gnn.FeatureStore
+        The feature store containing node features
+    num_classes: int
+        The number of output classes of the model
+    max_num_batches: int
+        If not None, iteration stops once the batch index reaches this number.
+        Used to avoid hang due to asymmetric input.
+
+    Returns
+    -------
+    float
+        The calculated accuracy, as a percentage. The value is produced as
+        a 0-dim tensor on the CUDA device.
+    """
+
+    print("Computing accuracy...", flush=True)
+    metric = Accuracy(task="multiclass", num_classes=num_classes).cuda()
+    acc_sum = 0.0
+    num_batches = 0
+    with torch.no_grad():
+        for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader):
+            x, y_true = get_features(
+                input_nodes, output_nodes, feature_store=feature_store
+            )
+            x = x.to("cuda")
+            y_true = y_true.to("cuda")
+
+            out = model(blocks, x)
+            # Score only the first out.shape[0] labels, i.e. one label per
+            # row the model produced.
+            acc_sum += metric(out.softmax(dim=-1), y_true[: out.shape[0]])
+            num_batches += 1
+
+            if max_num_batches is not None and iter_i >= max_num_batches:
+                break
+
+    # Sum the per-rank totals so every rank reports the same global mean.
+    acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device="cuda")
+    td.all_reduce(acc_sum, op=td.ReduceOp.SUM)
+    nb = torch.tensor(float(num_batches), dtype=torch.float32, device=acc_sum.device)
+    td.all_reduce(nb, op=td.ReduceOp.SUM)
+
+    acc = acc_sum / nb
+
+    print(
+        f"Accuracy: {acc * 100.0:.4f}%",
+    )
+    return acc * 100.0
+
+
+class DGLTrainer(Trainer):
+    """
+    Trainer implementation for node classification in DGL. Uses the model,
+    optimizer, data, dataset and get_loader available on self.
+    """
+
+    def train(self):
+        """Trains, tests after every epoch, validates once; returns a stats dict."""
+        logger = logging.getLogger("DGLTrainer")
+        time_d = {
+            "time_loader": 0.0,
+            "time_feature_indexing": 0.0,
+            "time_feature_transfer": 0.0,
+            "time_forward": 0.0,
+            "time_backward": 0.0,
+        }
+        total_batches = 0
+        for epoch in range(self.num_epochs):
+            epoch_began = time.perf_counter()
+            self.model.train()
+            # Join handles ranks that run out of batches early (uneven inputs).
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
+                loader, max_num_batches = self.get_loader(epoch=epoch, stage="train")
+                total_batches += train_epoch(
+                    model=self.model,
+                    optimizer=self.optimizer,
+                    loader=loader,
+                    feature_store=self.data,
+                    num_classes=self.dataset.num_labels,
+                    epoch=epoch,
+                    time_d=time_d,
+                    logger=logger,
+                    rank=self.rank,
+                    max_num_batches=max_num_batches,
+                )
+            epoch_time_taken = time.perf_counter() - epoch_began
+            print(
+                f"RANK: {self.rank} Total time taken for training epoch {epoch} = {epoch_time_taken}",
+                flush=True,
+            )
+            print("---" * 30)
+            # All ranks finish the training epoch before evaluation begins.
+            td.barrier()
+            self.model.eval()
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
+                # Per-epoch test pass; evaluates self.model.module, not the wrapper.
+                loader, max_num_batches = self.get_loader(epoch=epoch, stage="test")
+                test_acc = get_accuracy(
+                    model=self.model.module,
+                    loader=loader,
+                    feature_store=self.data,
+                    num_classes=self.dataset.num_labels,
+                    max_num_batches=max_num_batches,
+                )
+                print(f"Accuracy: {test_acc:.4f}%")
+
+        # Final validation pass; reuses the last epoch index of the loop above.
+        self.model.eval()
+        with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            loader, max_num_batches = self.get_loader(epoch=epoch, stage="val")
+            val_acc = get_accuracy(
+                model=self.model.module,
+                loader=loader,
+                feature_store=self.data,
+                num_classes=self.dataset.num_labels,
+                max_num_batches=max_num_batches,
+            )
+            print(f"Validation Accuracy: {val_acc:.4f}%")
+
+        return {
+            "Accuracy": float(val_acc),
+            "# Batches": total_batches,
+            "Loader Time": time_d["time_loader"],
+            "Feature Time": time_d["time_feature_indexing"]
+            + time_d["time_feature_transfer"],
+            "Forward Time": time_d["time_forward"],
+            "Backward Time": time_d["time_backward"],
+        }
+
+
+# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py
index 71151e9ba59..833322deffe 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py
@@ -13,41 +13,84 @@
 
 from .trainers_pyg import PyGTrainer
 from models.pyg import CuGraphSAGE
+from datasets import Dataset
 
 import torch
 import numpy as np
 
 from torch.nn.parallel import DistributedDataParallel as ddp
+from torch.distributed.optim import ZeroRedundancyOptimizer
 
 from cugraph.gnn import FeatureStore
 from cugraph_pyg.data import CuGraphStore
 from cugraph_pyg.loader import BulkSampleLoader
 
 import os
+import re
 
 
 class PyGCuGraphTrainer(PyGTrainer):
+    """
+    Trainer implementation for cuGraph-PyG that supports
+    WholeGraph as a feature store.
+    """
+
     def __init__(
         self,
-        dataset,
-        model="GraphSAGE",
-        device=0,
-        rank=0,
-        world_size=1,
-        num_epochs=1,
-        sample_dir=".",
+        dataset: Dataset,
+        model: str = "GraphSAGE",
+        device: int = 0,
+        rank: int = 0,
+        world_size: int = 1,
+        gpus_per_node: int = 1,
+        num_epochs: int = 1,
+        sample_dir: str = ".",
+        backend: str = "torch",
         **kwargs,
     ):
+        """
+        Parameters
+        ----------
+        dataset: Dataset
+            The dataset to train on.
+        model: str, default="GraphSAGE"
+            The model to use for training.
+            Currently only "GraphSAGE" is supported.
+        device: int, default=0
+            The CUDA device to use.
+        rank: int, default=0
+            The global rank of the worker this trainer is assigned to.
+        world_size: int, default=1
+            The number of workers in the world.
+        gpus_per_node: int, default=1
+            The number of GPUs per node; sample files are split this many ways.
+        num_epochs: int, default=1
+            The number of training epochs to run.
+        sample_dir: str, default="."
+            The directory where the bulk sampler's output samples are stored.
+        backend: str, default="torch"
+            The cuGraph feature store backend: "torch" or "wholegraph".
+        kwargs
+            Keyword arguments to pass to the loader; must include "num_neighbors".
+        """
+
+        import logging
+
+        logger = logging.getLogger("PyGCuGraphTrainer")
+        logger.info("creating trainer")
         self.__data = None
         self.__device = device
         self.__rank = rank
         self.__world_size = world_size
+        self.__gpus_per_node = gpus_per_node
         self.__num_epochs = num_epochs
         self.__dataset = dataset
         self.__sample_dir = sample_dir
         self.__loader_kwargs = kwargs
         self.__model = self.get_model(model)
+        self.__backend = backend
         self.__optimizer = None
+        logger.info("created trainer")
 
     @property
     def rank(self):
@@ -64,8 +107,11 @@ def dataset(self):
     @property
     def optimizer(self):
         if self.__optimizer is None:
-            self.__optimizer = torch.optim.Adam(
-                self.model.parameters(), lr=0.01, weight_decay=0.0005
+            self.__optimizer = ZeroRedundancyOptimizer(
+                self.model.parameters(),
+                lr=0.01,
+                weight_decay=0.0005,
+                optimizer_class=torch.optim.Adam,
             )
         return self.__optimizer
 
@@ -73,7 +119,7 @@ def optimizer(self):
     def num_epochs(self) -> int:
         return self.__num_epochs
 
-    def get_loader(self, epoch: int = 0, stage="train") -> int:
+    def get_loader(self, epoch: int = 0, stage="train"):
         import logging
 
         logger = logging.getLogger("PyGCuGraphTrainer")
@@ -81,22 +127,25 @@ def get_loader(self, epoch: int = 0, stage="train") -> int:
         logger.info(f"getting loader for epoch {epoch}, {stage} stage")
 
         # TODO support online sampling
-        if stage == "val":
-            path = os.path.join(self.__sample_dir, "val", "samples")
-        else:
+        if stage == "train":
             path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples")
+        elif stage in ["test", "val"]:
+            path = os.path.join(self.__sample_dir, stage, "samples")
+        else:
+            raise ValueError(f"invalid stage {stage}")
 
+        input_files, num_batches = self.get_input_files(path, epoch=epoch, stage=stage)
         loader = BulkSampleLoader(
             self.data,
             self.data,
             None,  # FIXME get input nodes properly
             directory=path,
-            input_files=self.get_input_files(path, epoch=epoch, stage=stage),
+            input_files=input_files,
             **self.__loader_kwargs,
         )
 
         logger.info(f"got loader successfully on rank {self.rank}")
-        return loader
+        return loader, num_batches
 
     @property
     def data(self):
@@ -106,36 +155,73 @@ def data(self):
         logger.info("getting data")
 
         if self.__data is None:
-            # FIXME wholegraph
-            fs = FeatureStore(backend="torch")
+            if self.__backend == "wholegraph":
+                logger.info("using wholegraph backend")
+                fs = FeatureStore(
+                    backend="wholegraph",
+                    wg_type="chunked",
+                    wg_location="cpu",
+                )
+            else:
+                fs = FeatureStore(backend=self.__backend)
             num_nodes_dict = {}
 
+            if self.__backend == "wholegraph":
+                from pylibwholegraph.torch.initialize import get_global_communicator
+
+                wm_comm = get_global_communicator()
+                wm_comm.barrier()
+
             for node_type, x in self.__dataset.x_dict.items():
                 logger.debug(f"getting x for {node_type}")
                 fs.add_data(x, node_type, "x")
                 num_nodes_dict[node_type] = self.__dataset.num_nodes(node_type)
+                if self.__backend == "wholegraph":
+                    wm_comm.barrier()
 
             for node_type, y in self.__dataset.y_dict.items():
                 logger.debug(f"getting y for {node_type}")
-                fs.add_data(y, node_type, "y")
 
+                if self.__backend == "wholegraph":
+                    logger.info("using wholegraph backend")
+                    fs.add_data(y, node_type, "y")
+                    wm_comm.barrier()
+                else:
+                    y = y.cuda()
+                    y = y.reshape((y.shape[0], 1))
+                    fs.add_data(y, node_type, "y")
+
+            """
             for node_type, train in self.__dataset.train_dict.items():
                 logger.debug(f"getting train for {node_type}")
+                train = train.reshape((train.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    train = train.cuda()
                 fs.add_data(train, node_type, "train")
 
             for node_type, test in self.__dataset.test_dict.items():
                 logger.debug(f"getting test for {node_type}")
+                test = test.reshape((test.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    test = test.cuda()
                 fs.add_data(test, node_type, "test")
 
             for node_type, val in self.__dataset.val_dict.items():
                 logger.debug(f"getting val for {node_type}")
+                val = val.reshape((val.shape[0], 1))
+                if self.__backend != "wholegraph":
+                    val = val.cuda()
                 fs.add_data(val, node_type, "val")
+            """
 
             # TODO support online sampling if the edge index is provided
             num_edges_dict = self.__dataset.edge_index_dict
             if not isinstance(list(num_edges_dict.values())[0], int):
                 num_edges_dict = {k: len(v) for k, v in num_edges_dict}
 
+            if self.__backend == "wholegraph":
+                wm_comm.barrier()
+
             self.__data = CuGraphStore(
                 fs,
                 num_edges_dict,
@@ -147,14 +233,28 @@ def data(self):
         return self.__data
 
     def get_model(self, name="GraphSAGE"):
+        import logging
+
+        logger = logging.getLogger("PyGCuGraphTrainer")
+
+        logger.info("Creating model...")
+
         if name != "GraphSAGE":
             raise ValueError("only GraphSAGE is currently supported")
 
+        logger.info("getting input features...")
         num_input_features = self.__dataset.num_input_features
+
+        logger.info("getting output features...")
         num_output_features = self.__dataset.num_labels
+
+        logger.info("getting num neighbors...")
         num_layers = len(self.__loader_kwargs["num_neighbors"])
 
+        logger.info("Got input features, output features, num neighbors")
+
         with torch.cuda.device(self.__device):
+            logger.info("Constructing CuGraphSAGE model...")
             model = (
                 CuGraphSAGE(
                     in_channels=num_input_features,
@@ -166,8 +266,10 @@ def get_model(self, name="GraphSAGE"):
                 .to(self.__device)
             )
 
+            logger.info("Parallelizing model with ddp...")
             model = ddp(model, device_ids=[self.__device])
-            print("done creating model")
+
+        logger.info("done creating model")
 
         return model
 
@@ -175,10 +277,28 @@ def get_input_files(self, path, epoch=0, stage="train"):
         file_list = np.array(os.listdir(path))
         file_list.sort()
 
-        if stage == "train":
-            splits = np.array_split(file_list, self.__world_size)
-            np.random.seed(epoch)
-            np.random.shuffle(splits)
-            return splits[self.rank]
-        else:
-            return file_list
+        np.random.seed(epoch)
+        np.random.shuffle(file_list)
+
+        splits = np.array_split(file_list, self.__gpus_per_node)
+
+        import logging
+
+        logger = logging.getLogger("PyGCuGraphTrainer")
+
+        split = splits[self.__device]
+        logger.info(f"rank {self.__rank} input files: {str(split)}")
+
+        ex = re.compile(r"batch=([0-9]+)\-([0-9]+).parquet")
+        num_batches = min(
+            [
+                sum([int(ex.match(fname)[2]) - int(ex.match(fname)[1]) for fname in s])
+                for s in splits
+            ]
+        )
+        if num_batches == 0:
+            raise ValueError(
+                f"Too few batches for training with world size {self.__world_size}"
+            )
+
+        return split, num_batches
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index bddd6ae2644..d6205901b68 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -33,7 +33,12 @@
 import time
 
 
-def pyg_num_workers(world_size):
+def pyg_num_workers(world_size: int) -> int:
+    """
+    Calculates the number of workers for the
+    loader in PyG by calling sched_getaffinity.
+    """
+
     num_workers = None
     if hasattr(os, "sched_getaffinity"):
         try:
@@ -45,14 +50,80 @@ def pyg_num_workers(world_size):
     return int(num_workers)
 
 
+def calc_accuracy(
+    loader: NeighborLoader,
+    max_num_batches: int,
+    model: torch.nn.Module,
+    num_classes: int,
+) -> torch.Tensor:
+    """
+    Evaluates the accuracy of a model given a loader over evaluation samples.
+
+    Parameters
+    ----------
+    loader: NeighborLoader
+        The loader over evaluation samples.
+    max_num_batches: int
+        The maximum number of batches to evaluate on this rank.
+        If None, every batch yielded by the loader is evaluated.
+    model: torch.nn.Module
+        The model being evaluated.
+    num_classes: int
+        The number of output classes of the model.
+
+    Returns
+    -------
+    The mean per-batch accuracy across all ranks, as a 0-dim CUDA tensor.
+    """
+    from torchmetrics import Accuracy
+
+    acc = Accuracy(task="multiclass", num_classes=num_classes).cuda()
+    acc_sum = 0.0
+    num_batches = 0
+    with torch.no_grad():
+        for i, batch in enumerate(loader):
+            # Stop before any work so at most max_num_batches batches are scored.
+            if max_num_batches is not None and i >= max_num_batches:
+                break
+            num_sampled_nodes = sum(
+                [torch.as_tensor(n) for n in batch.num_sampled_nodes_dict.values()]
+            )
+            num_sampled_edges = sum(
+                [torch.as_tensor(e) for e in batch.num_sampled_edges_dict.values()]
+            )
+            batch_size = num_sampled_nodes[0]
+            batch = batch.to_homogeneous().cuda()
+            batch.y = batch.y.to(torch.long).reshape((batch.y.shape[0],))
+
+            out = model(
+                batch.x,
+                batch.edge_index,
+                num_sampled_nodes,
+                num_sampled_edges,
+            )
+            acc_sum += acc(out[:batch_size].softmax(dim=-1), batch.y[:batch_size])
+            num_batches += 1
+
+    # Sum the per-rank accuracy totals and batch counts across all ranks.
+    acc_sum = torch.tensor(float(acc_sum), dtype=torch.float32, device="cuda")
+    td.all_reduce(acc_sum, op=td.ReduceOp.SUM)
+    nb = torch.tensor(float(num_batches), dtype=torch.float32, device=acc_sum.device)
+    td.all_reduce(nb, op=td.ReduceOp.SUM)
+
+    return acc_sum / nb
+
+
 class PyGTrainer(Trainer):
+    """
+    Trainer implementation for node classification in PyG.
+    """
+
     def train(self):
         import logging
 
         logger = logging.getLogger("PyGTrainer")
         logger.info("Entered train loop")
 
-        total_loss = 0.0
         num_batches = 0
 
         time_forward = 0.0
@@ -62,19 +133,32 @@ def train(self):
         start_time = time.perf_counter()
         end_time_backward = start_time
 
+        num_layers = len(self.model.module.convs)
+
         for epoch in range(self.num_epochs):
             with td.algorithms.join.Join(
-                [self.model], divide_by_initial_world_size=False
+                [self.model, self.optimizer], divide_by_initial_world_size=False
             ):
                 self.model.train()
-                for iter_i, data in enumerate(
-                    self.get_loader(epoch=epoch, stage="train")
-                ):
+                loader, max_num_batches = self.get_loader(epoch=epoch, stage="train")
+
+                max_num_batches = torch.tensor([max_num_batches], device="cuda")
+                torch.distributed.all_reduce(
+                    max_num_batches, op=torch.distributed.ReduceOp.MIN
+                )
+                max_num_batches = int(max_num_batches[0])
+
+                for iter_i, data in enumerate(loader):
                     loader_time_iter = time.perf_counter() - end_time_backward
                     time_loader += loader_time_iter
 
                     time_feature_transfer_start = time.perf_counter()
 
+                    if len(data.edge_index_dict[("paper", "cites", "paper")][0]) < 3:
+                        logger.error(f"Invalid edge index in iteration {iter_i}")
+                        data = old_data
+
+                    old_data = data
                     num_sampled_nodes = sum(
                         [
                             torch.as_tensor(n)
@@ -89,7 +173,6 @@ def train(self):
                     )
 
                     # FIXME find a way to get around this and not have to call extend_tensor
-                    num_layers = len(self.model.module.convs)
                     num_sampled_nodes = extend_tensor(num_sampled_nodes, num_layers + 1)
                     num_sampled_edges = extend_tensor(num_sampled_edges, num_layers)
 
@@ -118,7 +201,12 @@ def train(self):
                         )
                         logger.info(f"total time: {total_time_iter}")
 
+                        # from pynvml.smi import nvidia_smi
+                        # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
+                        # logger.info(f"rank {self.rank} memory: {mem_info}")
+
                     y_true = data.y
+                    y_true = y_true.reshape((y_true.shape[0],))
                     x = data.x.to(torch.float32)
 
                     start_time_forward = time.perf_counter()
@@ -160,101 +248,48 @@ def train(self):
                     self.optimizer.zero_grad()
                     loss.backward()
                     self.optimizer.step()
-                    total_loss += loss.item()
                     end_time_backward = time.perf_counter()
                     time_backward += end_time_backward - start_time_backward
 
-            end_time = time.perf_counter()
-
-            # test
-            from torchmetrics import Accuracy
+                    if max_num_batches is not None and iter_i >= max_num_batches:
+                        break
 
-            acc = Accuracy(
-                task="multiclass", num_classes=self.dataset.num_labels
-            ).cuda()
+            end_time = time.perf_counter()
 
+            """
+            logger.info("Entering test stage...")
             with td.algorithms.join.Join(
                 [self.model], divide_by_initial_world_size=False
             ):
                 self.model.eval()
-                if self.rank == 0:
-                    acc_sum = 0.0
-                    with torch.no_grad():
-                        for i, batch in enumerate(
-                            self.get_loader(epoch=epoch, stage="test")
-                        ):
-                            num_sampled_nodes = sum(
-                                [
-                                    torch.as_tensor(n)
-                                    for n in batch.num_sampled_nodes_dict.values()
-                                ]
-                            )
-                            num_sampled_edges = sum(
-                                [
-                                    torch.as_tensor(e)
-                                    for e in batch.num_sampled_edges_dict.values()
-                                ]
-                            )
-                            batch_size = num_sampled_nodes[0]
-
-                            batch = batch.to_homogeneous().cuda()
-
-                            batch.y = batch.y.to(torch.long)
-                            out = self.model.module(
-                                batch.x,
-                                batch.edge_index,
-                                num_sampled_nodes,
-                                num_sampled_edges,
-                            )
-                            acc_sum += acc(
-                                out[:batch_size].softmax(dim=-1), batch.y[:batch_size]
-                            )
-                    print(
-                        f"Accuracy: {acc_sum/(i) * 100.0:.4f}%",
-                    )
+                loader, max_num_batches = self.get_loader(epoch=epoch, stage="test")
+                num_classes = self.dataset.num_labels
 
-            td.barrier()
+                acc = calc_accuracy(
+                    loader, max_num_batches, self.model.module, num_classes
+                )
 
-        with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
-            self.model.eval()
             if self.rank == 0:
-                acc_sum = 0.0
-                with torch.no_grad():
-                    for i, batch in enumerate(
-                        self.get_loader(epoch=epoch, stage="val")
-                    ):
-                        num_sampled_nodes = sum(
-                            [
-                                torch.as_tensor(n)
-                                for n in batch.num_sampled_nodes_dict.values()
-                            ]
-                        )
-                        num_sampled_edges = sum(
-                            [
-                                torch.as_tensor(e)
-                                for e in batch.num_sampled_edges_dict.values()
-                            ]
-                        )
-                        batch_size = num_sampled_nodes[0]
-
-                        batch = batch.to_homogeneous().cuda()
-
-                        batch.y = batch.y.to(torch.long)
-                        out = self.model.module(
-                            batch.x,
-                            batch.edge_index,
-                            num_sampled_nodes,
-                            num_sampled_edges,
-                        )
-                        acc_sum += acc(
-                            out[:batch_size].softmax(dim=-1), batch.y[:batch_size]
-                        )
                 print(
-                    f"Validation Accuracy: {acc_sum/(i) * 100.0:.4f}%",
+                    f"Accuracy: {acc * 100.0:.4f}%",
                 )
+            """
+
+        """
+        logger.info("Entering validation stage")
+        with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            self.model.eval()
+            loader, max_num_batches = self.get_loader(epoch=epoch, stage="val")
+            num_classes = self.dataset.num_labels
+            acc = calc_accuracy(loader, max_num_batches, self.model.module, num_classes)
+
+        if self.rank == 0:
+            print(
+                f"Validation Accuracy: {acc * 100.0:.4f}%",
+            )
+        """
 
         stats = {
-            "Accuracy": float(acc_sum / (i) * 100.0) if self.rank == 0 else 0.0,
             "# Batches": num_batches,
             "Loader Time": time_loader,
             "Feature Transfer Time": time_feature_transfer,
@@ -265,6 +300,12 @@ def train(self):
 
 
 class PyGNativeTrainer(PyGTrainer):
+    """
+    Trainer implementation for native PyG
+    training using HeteroData as the graph and feature
+    store and NeighborLoader as the loader.
+    """
+
     def __init__(
         self,
         dataset,
@@ -403,7 +444,7 @@ def get_loader(self, epoch: int = 0, stage="train"):
         )
 
         logger.info("done creating loader")
-        return loader
+        return loader, None
 
     def get_model(self, name="GraphSAGE"):
         if name != "GraphSAGE":
diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
index 97eb32e2aaa..3b085a9bfdb 100644
--- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py
+++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -242,6 +242,28 @@ def get_highest_degree_node(graph_obj):
     return max(degrees, key=lambda t: t[1])[0]
 
 
+def build_personalization_dict(pagerank_dict):
+    """
+    Builds a personalization mapping for nx.pagerank() out of an existing
+    pagerank result. The items of pagerank_dict are split, in iteration
+    order, into a front half and a back half; each front-half key receives
+    the value of the back-half item at the same position and vice versa, so
+    (most if not all) nodes end up with a value different from their own.
+    """
+    items = list(pagerank_dict.items())
+    half = len(items) // 2
+    front, back = items[:half], items[half:]
+
+    # With an odd item count the back half holds one extra trailing item;
+    # seeding the result from the back half keeps that leftover key with its
+    # original value, since zip() below stops at the shorter front half.
+    swapped = dict(back)
+    swapped.update({fk: bv for (fk, _), (_, bv) in zip(front, back)})
+    swapped.update({bk: fv for (_, fv), (bk, _) in zip(front, back)})
+
+    return swapped
+
+
 ################################################################################
 # Benchmarks
 def bench_from_networkx(benchmark, graph_obj):
@@ -431,6 +453,26 @@ def bench_pagerank(benchmark, graph_obj, backend_wrapper):
     assert type(result) is dict
 
 
+def bench_pagerank_personalized(benchmark, graph_obj, backend_wrapper):
+    """Benchmark nx.pagerank() when called with a personalization dict."""
+    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
+
+    # FIXME: the personalization values are recomputed for every combination
+    # of inputs, even though they only depend on the graph/dataset. Ideally
+    # this setup would run once per graph/dataset.
+    personalization_dict = build_personalization_dict(nx.pagerank(G))
+
+    result = benchmark.pedantic(
+        target=backend_wrapper(nx.pagerank),
+        args=(G,),
+        kwargs={"personalization": personalization_dict},
+        rounds=rounds,
+        iterations=iterations,
+        warmup_rounds=warmup_rounds,
+    )
+    assert type(result) is dict
+
+
 def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper):
     G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
     node = get_highest_degree_node(graph_obj)
@@ -804,3 +846,73 @@ def bench_weakly_connected_components(benchmark, graph_obj, backend_wrapper):
         warmup_rounds=warmup_rounds,
     )
     assert type(result) is list
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_connected_components(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_connected(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_node_connected_component(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_number_connected_components(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_isolate(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_isolates(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_number_of_isolates(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_complement(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_reverse(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_arborescence(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_branching(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_forest(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
+
+
+@pytest.mark.skip(reason="benchmark not implemented")
+def bench_is_tree(benchmark, graph_obj, backend_wrapper):
+    pass  # intentionally empty; never runs because of the skip marker above
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
deleted file mode 100644
index ba8b73898e2..00000000000
--- a/ci/checks/copyright.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import datetime
-import os
-import re
-import sys
-
-import git
-
-FilesToCheck = [
-    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
-    re.compile(r"CMakeLists[.]txt$"),
-    re.compile(r"setup[.]cfg$"),
-    re.compile(r"[.]flake8[.]cython$"),
-    re.compile(r"meta[.]yaml$"),
-]
-
-# this will break starting at year 10000, which is probably OK :)
-CheckSimple = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"
-)
-CheckDouble = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
-)
-
-
-def checkThisFile(f):
-    if isinstance(f, git.Diff):
-        if f.deleted_file or f.b_blob.size == 0:
-            return False
-        f = f.b_path
-    elif not os.path.exists(f) or os.stat(f).st_size == 0:
-        # This check covers things like symlinks which point to files that DNE
-        return False
-    for checker in FilesToCheck:
-        if checker.search(f):
-            return True
-    return False
-
-
-def modifiedFiles():
-    """Get a set of all modified files, as Diff objects.
-
-    The files returned have been modified in git since the merge base of HEAD
-    and the upstream of the target branch. We return the Diff objects so that
-    we can read only the staged changes.
-    """
-    repo = git.Repo()
-    # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible
-    target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH"))
-    if target_branch is None:
-        # Fall back to the closest branch if not on CI
-        target_branch = repo.git.describe(
-            all=True, tags=True, match="branch-*", abbrev=0
-        ).lstrip("heads/")
-
-    upstream_target_branch = None
-    if target_branch in repo.heads:
-        # Use the tracking branch of the local reference if it exists. This
-        # returns None if no tracking branch is set.
-        upstream_target_branch = repo.heads[target_branch].tracking_branch()
-    if upstream_target_branch is None:
-        # Fall back to the remote with the newest target_branch. This code
-        # path is used on CI because the only local branch reference is
-        # current-pr-branch, and thus target_branch is not in repo.heads.
-        # This also happens if no tracking branch is defined for the local
-        # target_branch. We use the remote with the latest commit if
-        # multiple remotes are defined.
-        candidate_branches = [
-            remote.refs[target_branch] for remote in repo.remotes
-            if target_branch in remote.refs
-        ]
-        if len(candidate_branches) > 0:
-            upstream_target_branch = sorted(
-                candidate_branches,
-                key=lambda branch: branch.commit.committed_datetime,
-            )[-1]
-        else:
-            # If no remotes are defined, try to use the local version of the
-            # target_branch. If this fails, the repo configuration must be very
-            # strange and we can fix this script on a case-by-case basis.
-            upstream_target_branch = repo.heads[target_branch]
-    merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0]
-    diff = merge_base.diff()
-    changed_files = {f for f in diff if f.b_path is not None}
-    return changed_files
-
-
-def getCopyrightYears(line):
-    res = CheckSimple.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(1))
-    res = CheckDouble.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(2))
-    return None, None
-
-
-def replaceCurrentYear(line, start, end):
-    # first turn a simple regex into double (if applicable). then update years
-    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
-    res = CheckDouble.sub(
-        rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION",
-        res,
-    )
-    return res
-
-
-def checkCopyright(f, update_current_year):
-    """Checks for copyright headers and their years."""
-    errs = []
-    thisYear = datetime.datetime.now().year
-    lineNum = 0
-    crFound = False
-    yearMatched = False
-
-    if isinstance(f, git.Diff):
-        path = f.b_path
-        lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True)
-    else:
-        path = f
-        with open(f, encoding="utf-8") as fp:
-            lines = fp.readlines()
-
-    for line in lines:
-        lineNum += 1
-        start, end = getCopyrightYears(line)
-        if start is None:
-            continue
-        crFound = True
-        if start > end:
-            e = [
-                path,
-                lineNum,
-                "First year after second year in the copyright "
-                "header (manual fix required)",
-                None,
-            ]
-            errs.append(e)
-        elif thisYear < start or thisYear > end:
-            e = [
-                path,
-                lineNum,
-                "Current year not included in the copyright header",
-                None,
-            ]
-            if thisYear < start:
-                e[-1] = replaceCurrentYear(line, thisYear, end)
-            if thisYear > end:
-                e[-1] = replaceCurrentYear(line, start, thisYear)
-            errs.append(e)
-        else:
-            yearMatched = True
-    # copyright header itself not found
-    if not crFound:
-        e = [
-            path,
-            0,
-            "Copyright header missing or formatted incorrectly "
-            "(manual fix required)",
-            None,
-        ]
-        errs.append(e)
-    # even if the year matches a copyright header, make the check pass
-    if yearMatched:
-        errs = []
-
-    if update_current_year:
-        errs_update = [x for x in errs if x[-1] is not None]
-        if len(errs_update) > 0:
-            lines_changed = ", ".join(str(x[1]) for x in errs_update)
-            print(f"File: {path}. Changing line(s) {lines_changed}")
-            for _, lineNum, __, replacement in errs_update:
-                lines[lineNum - 1] = replacement
-            with open(path, "w", encoding="utf-8") as out_file:
-                out_file.writelines(lines)
-
-    return errs
-
-
-def getAllFilesUnderDir(root, pathFilter=None):
-    retList = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        for fn in filenames:
-            filePath = os.path.join(dirpath, fn)
-            if pathFilter(filePath):
-                retList.append(filePath)
-    return retList
-
-
-def checkCopyright_main():
-    """
-    Checks for copyright headers in all the modified files. In case of local
-    repo, this script will just look for uncommitted files and in case of CI
-    it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch"
-    """
-    retVal = 0
-
-    argparser = argparse.ArgumentParser(
-        "Checks for a consistent copyright header in git's modified files"
-    )
-    argparser.add_argument(
-        "--update-current-year",
-        dest="update_current_year",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "update the current year if a header is already "
-        "present and well formatted.",
-    )
-    argparser.add_argument(
-        "--git-modified-only",
-        dest="git_modified_only",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "only files seen as modified by git will be "
-        "processed.",
-    )
-
-    args, dirs = argparser.parse_known_args()
-
-    if args.git_modified_only:
-        files = [f for f in modifiedFiles() if checkThisFile(f)]
-    else:
-        files = []
-        for d in [os.path.abspath(d) for d in dirs]:
-            if not os.path.isdir(d):
-                raise ValueError(f"{d} is not a directory.")
-            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
-
-    errors = []
-    for f in files:
-        errors += checkCopyright(f, args.update_current_year)
-
-    if len(errors) > 0:
-        if any(e[-1] is None for e in errors):
-            print("Copyright headers incomplete in some of the files!")
-        for e in errors:
-            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
-        print("")
-        n_fixable = sum(1 for e in errors if e[-1] is not None)
-        path_parts = os.path.abspath(__file__).split(os.sep)
-        file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :])
-        if n_fixable > 0 and not args.update_current_year:
-            print(
-                f"You can run `python {file_from_repo} --git-modified-only "
-                "--update-current-year` and stage the results in git to "
-                f"fix {n_fixable} of these errors.\n"
-            )
-        retVal = 1
-
-    return retVal
-
-
-if __name__ == "__main__":
-    sys.exit(checkCopyright_main())
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 6aed308c498..f0eff82e1ae 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
 - ninja
 - notebook>=0.5.0
 - numba>=0.57
-- numpy>=1.23
+- numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - openmpi
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 4a095058219..93972f40d8b 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
 - ninja
 - notebook>=0.5.0
 - numba>=0.57
-- numpy>=1.23
+- numpy>=1.23,<2.0a0
 - numpydoc
 - openmpi
 - packaging>=21
diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml
index 09322a9c7d3..5e28e69a0d7 100644
--- a/conda/recipes/cugraph-dgl/meta.yaml
+++ b/conda/recipes/cugraph-dgl/meta.yaml
@@ -25,7 +25,7 @@ requirements:
     - cugraph ={{ version }}
     - dgl >=1.1.0.cu*
     - numba >=0.57
-    - numpy >=1.23
+    - numpy >=1.23,<2.0a0
     - pylibcugraphops ={{ minor_version }}
     - python
     - pytorch
diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml
index 624f5753fd2..4ada5e31211 100644
--- a/conda/recipes/cugraph-pyg/meta.yaml
+++ b/conda/recipes/cugraph-pyg/meta.yaml
@@ -28,7 +28,7 @@ requirements:
   run:
     - rapids-dask-dependency ={{ minor_version }}
     - numba >=0.57
-    - numpy >=1.23
+    - numpy >=1.23,<2.0a0
     - python
     - pytorch >=2.0
     - cupy >=12.0.0
diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml
index c04c1a7c7fa..8698d4f6985 100644
--- a/conda/recipes/cugraph-service/meta.yaml
+++ b/conda/recipes/cugraph-service/meta.yaml
@@ -60,7 +60,7 @@ outputs:
         - dask-cuda ={{ minor_version }}
         - dask-cudf ={{ minor_version }}
         - numba >=0.57
-        - numpy >=1.23
+        - numpy >=1.23,<2.0a0
         - python
         - rapids-dask-dependency ={{ minor_version }}
         - thriftpy2 >=0.4.15
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a3392627fb8..88908ef70ce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -186,9 +186,9 @@ endif()
 #       which should give us a better parallel schedule.
 
 set(CUGRAPH_SOURCES
-    src/detail/shuffle_vertices.cu
+    src/utilities/shuffle_vertices.cu
     src/detail/permute_range.cu
-    src/detail/shuffle_vertex_pairs.cu
+    src/utilities/shuffle_vertex_pairs.cu
     src/detail/collect_local_vertex_values.cu
     src/detail/groupby_and_count.cu
     src/detail/collect_comm_wrapper.cu
@@ -197,8 +197,8 @@ set(CUGRAPH_SOURCES
     src/community/detail/common_methods_sg.cu
     src/community/detail/refine_sg.cu
     src/community/detail/refine_mg.cu
-    src/community/detail/mis_sg.cu
-    src/community/detail/mis_mg.cu
+    src/community/detail/maximal_independent_moves_sg.cu
+    src/community/detail/maximal_independent_moves_mg.cu
     src/detail/utility_wrappers.cu
     src/structure/graph_view_mg.cu
     src/structure/remove_self_loops.cu
@@ -295,6 +295,10 @@ set(CUGRAPH_SOURCES
     src/tree/legacy/mst.cu
     src/components/weakly_connected_components_sg.cu
     src/components/weakly_connected_components_mg.cu
+    src/components/mis_sg.cu
+    src/components/mis_mg.cu
+    src/components/vertex_coloring_sg.cu
+    src/components/vertex_coloring_mg.cu
     src/structure/create_graph_from_edgelist_sg.cu
     src/structure/create_graph_from_edgelist_mg.cu
     src/structure/symmetrize_edgelist_sg.cu
@@ -411,6 +415,8 @@ endif()
 add_library(cugraph_c
         src/c_api/resource_handle.cpp
         src/c_api/array.cpp
+        src/c_api/degrees.cu
+        src/c_api/degrees_result.cpp
         src/c_api/error.cpp
         src/c_api/graph_sg.cpp
         src/c_api/graph_mg.cpp
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 5c29604a5a7..1471d340cec 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -2340,15 +2340,41 @@ std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<vertex_t>> k_hop_nbr
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Graph view object.
  * @param rng_state The RngState instance holding pseudo-random number generator state.
- * @return A device vector containing vertices found in the maximal independent set
+ * @return A device vector containing vertices in the maximal independent set.
  */
-
 template <typename vertex_t, typename edge_t, bool multi_gpu>
 rmm::device_uvector<vertex_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state);
 
+/**
+ * @brief Find a Greedy Vertex Coloring
+ *
+ * A vertex coloring is an assignment of colors or labels to each vertex of a graph so that
+ * no two adjacent vertices have the same color or label. Finding the minimum number of colors
+ * needed to color the vertices of a graph is an NP-hard problem and therefore for practical
+ * use cases greedy coloring is used. Here we provide an implementation of greedy vertex
+ * coloring based on maximal independent set.
+ * See
+ * https://research.nvidia.com/sites/default/files/pubs/2015-05_Parallel-Graph-Coloring/nvr-2015-001.pdf
+ * for further information.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether to target single-GPU (false) or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param rng_state The RngState instance holding pseudo-random number generator state.
+ * @return A device vector containing the color assigned to each vertex.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state);
+
 }  // namespace cugraph
 
 /**
diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
index dad5ce77e45..6b974a326dd 100644
--- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh
+++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -44,7 +44,7 @@ namespace detail {
 int32_t constexpr decompress_edge_partition_block_size = 1024;
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_mid_degree(
+__global__ static void decompress_to_edgelist_mid_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
@@ -74,7 +74,7 @@ __global__ void decompress_to_edgelist_mid_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_high_degree(
+__global__ static void decompress_to_edgelist_high_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 90425f86bef..6d4470e8251 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1052,4 +1052,67 @@ remove_multi_edges(raft::handle_t const& handle,
                    std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types,
                    bool keep_min_value_edge = false);
 
+/**
+ * @brief Shuffle external vertex ids to the proper GPU.
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @return Vector of vertex ids mapped to this GPU.
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices);
+
+/**
+ * @brief Shuffle external vertex ids and values to the proper GPU.
+ *
+ * @tparam vertex_t   Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t    Type of values. Currently supported types are int32_t,
+ * int64_t, size_t, float and double.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @param values List of values
+ * @return Tuple of vectors storing vertex ids and values mapped to this GPU.
+ */
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values);
+
+/**
+ * @brief Shuffle external edges to the proper GPU.
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t      Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t    Type of edge weight. Currently float and double are supported.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_srcs  List of source vertex ids
+ * @param edge_dsts  List of destination vertex ids
+ * @param edge_weights  Optional list of edge weights
+ * @param edge_ids  Optional list of edge ids
+ * @param edge_types Optional list of edge types
+ * @return Tuple of vectors storing edge sources, destinations, optional weights,
+ *          optional edge ids, optional edge types mapped to this GPU.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types);
+
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp
index 3f3514179bf..cbb52ef3b1e 100644
--- a/cpp/include/cugraph/graph_view.hpp
+++ b/cpp/include/cugraph/graph_view.hpp
@@ -613,7 +613,7 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
       major_value_range_start_offset);
   }
 
-  // FIXME: deprecated, replaced with copmute_number_of_edges (which works with or without edge
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
   // masking)
   edge_t number_of_edges() const
   {
@@ -923,7 +923,7 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
       offsets_, indices_, this->number_of_vertices());
   }
 
-  // FIXME: deprecated, replaced with copmute_number_of_edges (which works with or without edge
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
   // masking)
   edge_t number_of_edges() const
   {
diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index d3917a3e851..633dabe5b40 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -94,6 +94,14 @@ thrust::optional<T> to_thrust_optional(std::optional<T> val)
   return ret;
 }
 
+template <typename T>
+std::optional<T> to_std_optional(thrust::optional<T> val)
+{
+  std::optional<T> ret{std::nullopt};
+  if (val) { ret = *val; }
+  return ret;
+}
+
 template <typename idx_t, typename offset_t>
 rmm::device_uvector<idx_t> expand_sparse_offsets(raft::device_span<offset_t const> offsets,
                                                  idx_t base_idx,
diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h
index 8fe1ea0b958..94b06189796 100644
--- a/cpp/include/cugraph_c/graph_functions.h
+++ b/cpp/include/cugraph_c/graph_functions.h
@@ -229,6 +229,118 @@ cugraph_error_code_t cugraph_allgather(const cugraph_resource_handle_t* handle,
                                        cugraph_induced_subgraph_result_t** result,
                                        cugraph_error_t** error);
 
+/**
+ * @brief       Opaque degree result type
+ */
+typedef struct {
+  int32_t align_;
+} cugraph_degrees_result_t;
+
+/**
+ * @brief      Compute in degrees
+ *
+ * Compute the in degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute in degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_in_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief      Compute out degrees
+ *
+ * Compute the out degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute out degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_out_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief      Compute degrees
+ *
+ * Compute the degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_degrees(const cugraph_resource_handle_t* handle,
+                                     cugraph_graph_t* graph,
+                                     const cugraph_type_erased_device_array_view_t* source_vertices,
+                                     bool_t do_expensive_check,
+                                     cugraph_degrees_result_t** result,
+                                     cugraph_error_t** error);
+
+/**
+ * @brief       Get the vertex ids
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_vertices(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief       Get the in degrees
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of in degrees
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_in_degrees(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief       Get the out degrees
+ *
+ * If the graph is symmetric, in degrees and out degrees will be equal (and
+ * will be stored in the same memory).
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of out degrees
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief     Free degree result
+ *
+ * @param [in]    degrees_result   Opaque pointer to degree result
+ */
+void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh b/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
index 18e3a6669ad..a1aab595f2f 100644
--- a/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
+++ b/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
@@ -25,7 +25,6 @@
 #include <hash/helper_functions.cuh>
 #include <hash/managed.cuh>
 
-#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/hashing/detail/default_hash.cuh>
 #include <cudf/hashing/detail/hash_functions.cuh>
@@ -171,7 +170,6 @@ class concurrent_unordered_map {
                      const Equality& equal            = key_equal(),
                      const allocator_type& allocator  = allocator_type())
   {
-    CUDF_FUNC_RANGE();
     using Self = concurrent_unordered_map<Key, Element, Hasher, Equality, Allocator>;
 
     // Note: need `(*p).destroy` instead of `p->destroy` here
diff --git a/cpp/libcugraph_etl/include/hash/helper_functions.cuh b/cpp/libcugraph_etl/include/hash/helper_functions.cuh
index db377f938d2..8a11867f7e2 100644
--- a/cpp/libcugraph_etl/include/hash/helper_functions.cuh
+++ b/cpp/libcugraph_etl/include/hash/helper_functions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -131,7 +131,7 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co
 }
 
 template <typename value_type, typename size_type, typename key_type, typename elem_type>
-__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values,
+__global__ static void init_hashtbl(value_type* __restrict__ const hashtbl_values,
                              const size_type n,
                              const key_type key_val,
                              const elem_type elem_val)
diff --git a/cpp/libcugraph_etl/src/renumbering.cu b/cpp/libcugraph_etl/src/renumbering.cu
index 08759702ab4..1cbeeeeea05 100644
--- a/cpp/libcugraph_etl/src/renumbering.cu
+++ b/cpp/libcugraph_etl/src/renumbering.cu
@@ -270,7 +270,7 @@ __device__ __inline__ int32_t validate_ht_col_insert(volatile int32_t* ptr_col)
   return col;
 }
 
-__global__ void concat_and_create_histogram(int8_t* col_1,
+__global__ static void concat_and_create_histogram(int8_t* col_1,
                                             int32_t* offset_1,
                                             int8_t* col_2,
                                             int32_t* offset_2,
@@ -349,7 +349,7 @@ __global__ void concat_and_create_histogram(int8_t* col_1,
   }
 }
 
-__global__ void concat_and_create_histogram_2(int8_t* col_1,
+__global__ static void concat_and_create_histogram_2(int8_t* col_1,
                                               int32_t* offset_1,
                                               int8_t* col_2,
                                               int32_t* offset_2,
@@ -452,7 +452,7 @@ __global__ void concat_and_create_histogram_2(int8_t* col_1,
 }
 
 template <typename T>
-__global__ void set_src_vertex_idx(int8_t* col_1,
+__global__ static void set_src_vertex_idx(int8_t* col_1,
                                    int32_t* offset_1,
                                    int8_t* col_2,
                                    int32_t* offset_2,
@@ -509,7 +509,7 @@ __global__ void set_src_vertex_idx(int8_t* col_1,
 }
 
 template <typename T>
-__global__ void set_dst_vertex_idx(int8_t* col_1,
+__global__ static void set_dst_vertex_idx(int8_t* col_1,
                                    int32_t* offset_1,
                                    int8_t* col_2,
                                    int32_t* offset_2,
@@ -585,7 +585,7 @@ __global__ void set_dst_vertex_idx(int8_t* col_1,
   }
 }
 
-__global__ void create_mapping_histogram(uint32_t* hash_value,
+__global__ static void create_mapping_histogram(uint32_t* hash_value,
                                          str_hash_value* payload,
                                          cudf_map_type hash_map,
                                          accum_type count)
@@ -595,7 +595,7 @@ __global__ void create_mapping_histogram(uint32_t* hash_value,
   if (idx < count) { auto it = hash_map.insert(thrust::make_pair(hash_value[idx], payload[idx])); }
 }
 
-__global__ void assign_histogram_idx(cudf_map_type cuda_map_obj,
+__global__ static void assign_histogram_idx(cudf_map_type cuda_map_obj,
                                      size_t slot_count,
                                      str_hash_value* key,
                                      uint32_t* value,
@@ -621,7 +621,7 @@ __global__ void assign_histogram_idx(cudf_map_type cuda_map_obj,
   }
 }
 
-__global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count)
+__global__ static void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count)
 {
   accum_type tid = threadIdx.x + blockIdx.x * blockDim.x;
   // change count_ to renumber_idx
@@ -630,7 +630,7 @@ __global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type
   }
 }
 
-__global__ void set_output_col_offsets(str_hash_value* row_col_pair,
+__global__ static void set_output_col_offsets(str_hash_value* row_col_pair,
                                        int32_t* out_col1_offset,
                                        int32_t* out_col2_offset,
                                        int dst_pair_match,
@@ -653,7 +653,7 @@ __global__ void set_output_col_offsets(str_hash_value* row_col_pair,
   }
 }
 
-__global__ void offset_buffer_size_comp(int32_t* out_col1_length,
+__global__ static void offset_buffer_size_comp(int32_t* out_col1_length,
                                         int32_t* out_col2_length,
                                         int32_t* out_col1_offsets,
                                         int32_t* out_col2_offsets,
@@ -673,7 +673,7 @@ __global__ void offset_buffer_size_comp(int32_t* out_col1_length,
   }
 }
 
-__global__ void select_unrenumber_string(str_hash_value* idx_to_col_row,
+__global__ static void select_unrenumber_string(str_hash_value* idx_to_col_row,
                                          int32_t total_elements,
                                          int8_t* src_col1,
                                          int8_t* src_col2,
diff --git a/cpp/src/c_api/abstract_functor.hpp b/cpp/src/c_api/abstract_functor.hpp
index 219b1256065..8d3ed11341f 100644
--- a/cpp/src/c_api/abstract_functor.hpp
+++ b/cpp/src/c_api/abstract_functor.hpp
@@ -27,7 +27,7 @@ namespace c_api {
 struct abstract_functor {
   // Move to abstract functor... make operator a void, add cugraph_graph_t * result to functor
   // try that with instantiation questions
-  std::unique_ptr<cugraph_error_t> error_{std::make_unique<cugraph_error_t>("")};
+  std::unique_ptr<cugraph_error_t> error_ = {std::make_unique<cugraph_error_t>("")};
   cugraph_error_code_t error_code_{CUGRAPH_SUCCESS};
 
   void unsupported()
diff --git a/cpp/src/c_api/degrees.cu b/cpp/src/c_api/degrees.cu
new file mode 100644
index 00000000000..d6481efa905
--- /dev/null
+++ b/cpp/src/c_api/degrees.cu
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/abstract_functor.hpp"
+#include "c_api/degrees_result.hpp"
+#include "c_api/graph.hpp"
+#include "c_api/resource_handle.hpp"
+#include "c_api/utils.hpp"
+
+#include <cugraph_c/algorithms.h>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/vertex_partition_device_view.cuh>
+
+#include <thrust/gather.h>
+
+#include <optional>
+
+namespace {
+
+struct degrees_functor : public cugraph::c_api::abstract_functor {
+  raft::handle_t const& handle_;  // RAFT handle unwrapped from the C resource handle
+  cugraph::c_api::cugraph_graph_t* graph_{};  // type-erased graph; cast to its real type below
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* source_vertices_;  // may be null
+  bool in_degrees_{false};  // in-degrees requested
+  bool out_degrees_{false};  // out-degrees requested
+  bool do_expensive_check_{false};  // forwarded to the (un)renumbering calls
+  cugraph::c_api::cugraph_degrees_result_t* result_{};  // allocated at the end of operator()
+  // Unwraps the opaque C API handles; no work happens until operator() runs.
+  degrees_functor(cugraph_resource_handle_t const* handle,
+                  cugraph_graph_t* graph,
+                  ::cugraph_type_erased_device_array_view_t const* source_vertices,
+                  bool in_degrees,
+                  bool out_degrees,
+                  bool do_expensive_check)
+    : abstract_functor(),
+      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
+      source_vertices_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          source_vertices)),
+      in_degrees_{in_degrees},
+      out_degrees_{out_degrees},
+      do_expensive_check_(do_expensive_check)
+  {
+  }
+  // Computes the requested degrees for one concrete type combination and stores them in result_.
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename edge_type_type_t,
+            bool store_transposed,
+            bool multi_gpu>
+  void operator()()
+  {
+    // FIXME: Think about how to handle SG versus MG
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
+      unsupported();
+    } else {
+      auto graph =
+        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, store_transposed, multi_gpu>*>(
+          graph_->graph_);
+
+      auto graph_view = graph->view();
+
+      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
+
+      std::optional<rmm::device_uvector<edge_t>> in_degrees{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> out_degrees{std::nullopt};
+      // A symmetric graph has identical in- and out-degrees, so one array is computed for both.
+      if (in_degrees_ && out_degrees_ && graph_view.is_symmetric()) {
+        in_degrees = store_transposed ? graph_view.compute_in_degrees(handle_)
+                                      : graph_view.compute_out_degrees(handle_);
+        // out_degrees stays empty: cugraph_degrees_result_get_out_degrees returns in_degrees here
+      } else {
+        if (in_degrees_) in_degrees = graph_view.compute_in_degrees(handle_);
+
+        if (out_degrees_) out_degrees = graph_view.compute_out_degrees(handle_);
+      }
+
+      rmm::device_uvector<vertex_t> vertex_ids(0, handle_.get_stream());
+
+      if (source_vertices_) {  // caller asked for a subset of vertices
+        // FIXME: Would be more efficient if graph_view.compute_*_degrees could take a vertex
+        //  subset
+        vertex_ids.resize(source_vertices_->size_, handle_.get_stream());
+        raft::copy(vertex_ids.data(),
+                   source_vertices_->as_type<vertex_t>(),
+                   vertex_ids.size(),
+                   handle_.get_stream());
+
+        if constexpr (multi_gpu) {
+          vertex_ids = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+            handle_, std::move(vertex_ids));
+        }
+        // Renumber in place: vertex_ids holds internal ids from here until the unrenumber call.
+        cugraph::renumber_ext_vertices<vertex_t, multi_gpu>(
+          handle_,
+          vertex_ids.data(),
+          vertex_ids.size(),
+          number_map->data(),
+          graph_view.local_vertex_partition_range_first(),
+          graph_view.local_vertex_partition_range_last(),
+          do_expensive_check_);
+
+        auto vertex_partition = cugraph::vertex_partition_device_view_t<vertex_t, multi_gpu>(
+          graph_view.local_vertex_partition_view());
+        // Maps each requested vertex to its offset within this GPU's local degree arrays.
+        auto vertices_iter = thrust::make_transform_iterator(
+          vertex_ids.begin(),
+          cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
+            return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+          }));
+        // Keep only the requested entries by gathering them into right-sized temporaries.
+        if (in_degrees && out_degrees) {
+          rmm::device_uvector<edge_t> tmp_in_degrees(vertex_ids.size(), handle_.get_stream());
+          rmm::device_uvector<edge_t> tmp_out_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(
+            handle_.get_thrust_policy(),
+            vertices_iter,
+            vertices_iter + vertex_ids.size(),
+            thrust::make_zip_iterator(in_degrees->begin(), out_degrees->begin()),
+            thrust::make_zip_iterator(tmp_in_degrees.begin(), tmp_out_degrees.begin()));
+          in_degrees  = std::move(tmp_in_degrees);
+          out_degrees = std::move(tmp_out_degrees);
+        } else if (in_degrees) {
+          rmm::device_uvector<edge_t> tmp_in_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(handle_.get_thrust_policy(),
+                         vertices_iter,
+                         vertices_iter + vertex_ids.size(),
+                         in_degrees->begin(),
+                         tmp_in_degrees.begin());
+          in_degrees = std::move(tmp_in_degrees);
+        } else {  // otherwise only out_degrees is populated
+          rmm::device_uvector<edge_t> tmp_out_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(handle_.get_thrust_policy(),
+                         vertices_iter,
+                         vertices_iter + vertex_ids.size(),
+                         out_degrees->begin(),
+                         tmp_out_degrees.begin());
+          out_degrees = std::move(tmp_out_degrees);
+        }
+        // Report vertices under their external ids again.
+        cugraph::unrenumber_local_int_vertices<vertex_t>(
+          handle_,
+          vertex_ids.data(),
+          vertex_ids.size(),
+          number_map->data(),
+          graph_view.local_vertex_partition_range_first(),
+          graph_view.local_vertex_partition_range_last(),
+          do_expensive_check_);
+      } else {  // no subset given: report every local vertex
+        vertex_ids.resize(graph_view.local_vertex_partition_range_size(), handle_.get_stream());
+        raft::copy(vertex_ids.data(), number_map->data(), vertex_ids.size(), handle_.get_stream());
+      }
+      // A null in/out array pointer in the result means that array was not computed.
+      result_ = new cugraph::c_api::cugraph_degrees_result_t{
+        graph_view.is_symmetric(),
+        new cugraph::c_api::cugraph_type_erased_device_array_t(vertex_ids, graph_->vertex_type_),
+        in_degrees
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*in_degrees, graph_->edge_type_)
+          : nullptr,
+        out_degrees
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*out_degrees, graph_->edge_type_)
+          : nullptr};
+    }
+  }
+};
+
+}  // namespace
+
+extern "C" cugraph_error_code_t cugraph_in_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,  // NULL: all local vertices
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, true, false, do_expensive_check);
+  // Flags above are (in_degrees = true, out_degrees = false): only in-degrees are computed.
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+extern "C" cugraph_error_code_t cugraph_out_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,  // NULL: all local vertices
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, false, true, do_expensive_check);
+  // Flags above are (in_degrees = false, out_degrees = true): only out-degrees are computed.
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+extern "C" cugraph_error_code_t cugraph_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,  // NULL: all local vertices
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, true, true, do_expensive_check);
+  // Flags above are (in_degrees = true, out_degrees = true): both directions are requested.
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
diff --git a/cpp/src/c_api/degrees_result.cpp b/cpp/src/c_api/degrees_result.cpp
new file mode 100644
index 00000000000..a4649e36d05
--- /dev/null
+++ b/cpp/src/c_api/degrees_result.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/degrees_result.hpp"
+
+#include <cugraph_c/graph_functions.h>
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_vertices(
+  cugraph_degrees_result_t* degrees_result)
+{
+  // Recover the internal result object behind the opaque C handle, then expose its vertex ids.
+  auto impl = reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  auto view = impl->vertex_ids_->view();
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(view);
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_in_degrees(
+  cugraph_degrees_result_t* degrees_result)
+{
+  // Returns NULL when the result carries no in-degree array.
+  auto impl = reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  if (impl->in_degrees_ == nullptr) { return nullptr; }
+
+  auto view = impl->in_degrees_->view();
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(view);
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees(
+  cugraph_degrees_result_t* degrees_result)
+{
+  auto impl = reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+
+  // Prefer the dedicated out-degree array; for a symmetric graph fall back to the in-degree
+  // array. Returns NULL (instead of dereferencing a null array) when neither is available.
+  auto source = impl->out_degrees_;
+  if (source == nullptr && impl->is_symmetric) { source = impl->in_degrees_; }
+  if (source == nullptr) { return nullptr; }
+
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(source->view());
+}
+
+extern "C" void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result)
+{
+  auto impl = reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  if (impl == nullptr) { return; }  // like free(NULL): nothing to release
+  delete impl->vertex_ids_;         // the result owns its arrays; deleting a null array is a no-op
+  delete impl->in_degrees_;
+  delete impl->out_degrees_;
+  delete impl;
+}
diff --git a/cpp/src/c_api/degrees_result.hpp b/cpp/src/c_api/degrees_result.hpp
new file mode 100644
index 00000000000..c6e9bffa5a1
--- /dev/null
+++ b/cpp/src/c_api/degrees_result.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "c_api/array.hpp"
+
+namespace cugraph {
+namespace c_api {
+
+struct cugraph_degrees_result_t {
+  bool is_symmetric{false};  // graph symmetry; lets the out-degree getter reuse in_degrees_
+  cugraph_type_erased_device_array_t* vertex_ids_{};  // owned; vertices the degrees refer to
+  cugraph_type_erased_device_array_t* in_degrees_{};  // owned; null when not computed
+  cugraph_type_erased_device_array_t* out_degrees_{};  // owned; null when not computed
+};
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/maximal_independent_moves.cuh b/cpp/src/community/detail/maximal_independent_moves.cuh
new file mode 100644
index 00000000000..82d20a04203
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves.cuh
@@ -0,0 +1,313 @@
+
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "maximal_independent_moves.hpp"
+#include "prims/fill_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <cuda/functional>
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/merge.h>
+#include <thrust/optional.h>
+#include <thrust/remove.h>
+#include <thrust/set_operations.h>
+#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
+
+#include <cmath>
+
+namespace cugraph {
+
+namespace detail {
+
+// Selects vertices round by round so that a selected vertex has no selected neighbor.
+// Each round samples candidates among the remaining vertices: a candidate whose rank is at
+// least the largest rank seen on its in- and out-neighbors is selected (rank := max()), and a
+// candidate with an already selected neighbor is dropped (rank := lowest()). Vertices with zero
+// out-degree never participate. Returns the selected vertices of the local partition;
+// candidate sampling draws from rng_state.
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+
+  vertex_t local_vtx_partition_size = graph_view.local_vertex_partition_range_size();
+
+  rmm::device_uvector<vertex_t> remaining_vertices(local_vtx_partition_size, handle.get_stream());
+
+  auto vertex_begin =
+    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
+  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
+
+  // Compute out-degree
+  auto out_degrees = graph_view.compute_out_degrees(handle);
+
+  // Only vertices with non-zero out-degree can possibly move
+  remaining_vertices.resize(
+    thrust::distance(remaining_vertices.begin(),
+                     thrust::copy_if(handle.get_thrust_policy(),
+                                     vertex_begin,
+                                     vertex_end,
+                                     out_degrees.begin(),
+                                     remaining_vertices.begin(),
+                                     [] __device__(auto deg) { return deg > 0; })),
+    handle.get_stream());
+
+  // Set ID of each vertex as its rank
+  rmm::device_uvector<vertex_t> ranks(local_vtx_partition_size, handle.get_stream());
+  thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
+
+  // Set ranks of zero out-degree vertices to std::numeric_limits<vertex_t>::lowest()
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
+
+  out_degrees.resize(0, handle.get_stream());
+  out_degrees.shrink_to_fit(handle.get_stream());
+
+  size_t loop_counter = 0;
+  while (true) {
+    loop_counter++;
+
+    // Copy ranks into temporary vector to begin with
+
+    rmm::device_uvector<vertex_t> temporary_ranks(local_vtx_partition_size, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(), ranks.begin(), ranks.end(), temporary_ranks.begin());
+
+    // Select a random set of candidate vertices
+
+    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
+    if constexpr (multi_gpu) {
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    }
+
+    vertex_t nr_candidates = (nr_remaining_vertices_to_check < 1024)
+                               ? nr_remaining_vertices_to_check
+                               : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) *
+                                                                nr_remaining_vertices_to_check),
+                                          nr_remaining_vertices_to_check);
+
+    // FIXME: Can we improve performance here?
+    // FIXME: if(nr_remaining_vertices_to_check < 1024), may avoid calling select_random_vertices
+    auto d_sampled_vertices =
+      cugraph::select_random_vertices(handle,
+                                      graph_view,
+                                      std::make_optional(raft::device_span<vertex_t const>{
+                                        remaining_vertices.data(), remaining_vertices.size()}),
+                                      rng_state,
+                                      nr_candidates,
+                                      false,
+                                      true);
+
+    rmm::device_uvector<vertex_t> non_candidate_vertices(
+      remaining_vertices.size() - d_sampled_vertices.size(), handle.get_stream());
+
+    thrust::set_difference(handle.get_thrust_policy(),
+                           remaining_vertices.begin(),
+                           remaining_vertices.end(),
+                           d_sampled_vertices.begin(),
+                           d_sampled_vertices.end(),
+                           non_candidate_vertices.begin());
+
+    // Set temporary ranks of non-candidate vertices to std::numeric_limits<vertex_t>::lowest()
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      non_candidate_vertices.begin(),
+      non_candidate_vertices.end(),
+      [temporary_ranks =
+         raft::device_span<vertex_t>(temporary_ranks.data(), temporary_ranks.size()),
+       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+        //
+        // if rank of a non-candidate vertex is not std::numeric_limits<vertex_t>::max() (i.e. the
+        // vertex is not already in MIS), set it to std::numeric_limits<vertex_t>::lowest()
+        //
+        auto v_offset = v - v_first;
+        if (temporary_ranks[v_offset] < std::numeric_limits<vertex_t>::max()) {
+          temporary_ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
+        }
+      });
+
+    // Caches for ranks
+    edge_src_property_t<GraphViewType, vertex_t> src_rank_cache(handle);
+    edge_dst_property_t<GraphViewType, vertex_t> dst_rank_cache(handle);
+
+    // Update rank caches with temporary ranks
+    if constexpr (multi_gpu) {
+      src_rank_cache = edge_src_property_t<GraphViewType, vertex_t>(handle, graph_view);
+      dst_rank_cache = edge_dst_property_t<GraphViewType, vertex_t>(handle, graph_view);
+      update_edge_src_property(handle, graph_view, temporary_ranks.begin(), src_rank_cache);
+      update_edge_dst_property(handle, graph_view, temporary_ranks.begin(), dst_rank_cache);
+    }
+
+    // Find maximum rank outgoing neighbor for each vertex
+
+    rmm::device_uvector<vertex_t> max_outgoing_ranks(local_vtx_partition_size, handle.get_stream());
+
+    per_v_transform_reduce_outgoing_e(
+      handle,
+      graph_view,
+      multi_gpu
+        ? src_rank_cache.view()
+        : detail::edge_major_property_view_t<vertex_t, vertex_t const*>(temporary_ranks.data()),
+      multi_gpu ? dst_rank_cache.view()
+                : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                    temporary_ranks.data(), vertex_t{0}),
+      edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+      std::numeric_limits<vertex_t>::lowest(),
+      cugraph::reduce_op::maximum<vertex_t>{},
+      max_outgoing_ranks.begin());
+
+    // Find maximum rank incoming neighbor for each vertex
+
+    rmm::device_uvector<vertex_t> max_incoming_ranks(local_vtx_partition_size, handle.get_stream());
+
+    per_v_transform_reduce_incoming_e(
+      handle,
+      graph_view,
+      multi_gpu
+        ? src_rank_cache.view()
+        : detail::edge_major_property_view_t<vertex_t, vertex_t const*>(temporary_ranks.data()),
+      multi_gpu ? dst_rank_cache.view()
+                : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                    temporary_ranks.data(), vertex_t{0}),
+      edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src_rank; },
+      std::numeric_limits<vertex_t>::lowest(),
+      cugraph::reduce_op::maximum<vertex_t>{},
+      max_incoming_ranks.begin());
+
+    temporary_ranks.resize(0, handle.get_stream());
+    temporary_ranks.shrink_to_fit(handle.get_stream());
+
+    // Compute max of outgoing and incoming neighbors (result overwrites max_outgoing_ranks)
+    thrust::transform(handle.get_thrust_policy(),
+                      max_incoming_ranks.begin(),
+                      max_incoming_ranks.end(),
+                      max_outgoing_ranks.begin(),
+                      max_outgoing_ranks.begin(),
+                      thrust::maximum<vertex_t>());
+
+    max_incoming_ranks.resize(0, handle.get_stream());
+    max_incoming_ranks.shrink_to_fit(handle.get_stream());
+
+    //
+    // If the max neighbor of a vertex is already in MIS (i.e. has rank
+    // std::numeric_limits<vertex_t>::max()), discard it, otherwise,
+    // include the vertex if it has larger rank than its maximum rank neighbor
+    // NOTE: the predicate below also records each decision in ranks while filtering.
+    auto last = thrust::remove_if(
+      handle.get_thrust_policy(),
+      d_sampled_vertices.begin(),
+      d_sampled_vertices.end(),
+      [max_rank_neighbor_first = max_outgoing_ranks.begin(),
+       ranks                   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+        auto v_offset          = v - v_first;
+        auto max_neighbor_rank = *(max_rank_neighbor_first + v_offset);
+        auto rank_of_v         = ranks[v_offset];
+
+        if (max_neighbor_rank >= std::numeric_limits<vertex_t>::max()) {
+          // Maximum rank neighbor is already in MIS
+          // Discard current vertex by setting its rank to
+          // std::numeric_limits<vertex_t>::lowest()
+          ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
+          return true;
+        }
+
+        if (rank_of_v >= max_neighbor_rank) {
+          // Include v and set its rank to std::numeric_limits<vertex_t>::max()
+          ranks[v_offset] = std::numeric_limits<vertex_t>::max();
+          return true;
+        }
+        return false;
+      });
+
+    max_outgoing_ranks.resize(0, handle.get_stream());
+    max_outgoing_ranks.shrink_to_fit(handle.get_stream());
+
+    d_sampled_vertices.resize(thrust::distance(d_sampled_vertices.begin(), last),
+                              handle.get_stream());
+    d_sampled_vertices.shrink_to_fit(handle.get_stream());
+
+    remaining_vertices.resize(non_candidate_vertices.size() + d_sampled_vertices.size(),
+                              handle.get_stream());
+    remaining_vertices.shrink_to_fit(handle.get_stream());
+
+    // merge non-candidate and remaining candidate vertices
+    thrust::merge(handle.get_thrust_policy(),
+                  non_candidate_vertices.begin(),
+                  non_candidate_vertices.end(),
+                  d_sampled_vertices.begin(),
+                  d_sampled_vertices.end(),
+                  remaining_vertices.begin());
+
+    nr_remaining_vertices_to_check = remaining_vertices.size();
+    if constexpr (multi_gpu) {
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    }
+
+    if (nr_remaining_vertices_to_check == 0) { break; }
+  }
+
+  // Count number of vertices included in MIS
+
+  vertex_t nr_vertices_included_in_mis = thrust::count_if(
+    handle.get_thrust_policy(), ranks.begin(), ranks.end(), [] __device__(auto v_rank) {
+      return v_rank >= std::numeric_limits<vertex_t>::max();
+    });
+
+  // Build MIS and return
+  rmm::device_uvector<vertex_t> mis(nr_vertices_included_in_mis, handle.get_stream());
+  thrust::copy_if(
+    handle.get_thrust_policy(),
+    vertex_begin,
+    vertex_end,
+    ranks.begin(),
+    mis.begin(),
+    [] __device__(auto v_rank) { return v_rank >= std::numeric_limits<vertex_t>::max(); });
+
+  ranks.resize(0, handle.get_stream());
+  ranks.shrink_to_fit(handle.get_stream());
+  return mis;
+}
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/community/mis.hpp b/cpp/src/community/detail/maximal_independent_moves.hpp
similarity index 91%
rename from cpp/src/community/mis.hpp
rename to cpp/src/community/detail/maximal_independent_moves.hpp
index 83c0d9775f9..b5588b11ef7 100644
--- a/cpp/src/community/mis.hpp
+++ b/cpp/src/community/detail/maximal_independent_moves.hpp
@@ -23,9 +23,13 @@
 #include <rmm/device_uvector.hpp>
 
 namespace cugraph {
+namespace detail {
+
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> maximal_independent_set(
+rmm::device_uvector<vertex_t> maximal_independent_moves(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state);
+
+}  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/maximal_independent_moves_mg.cu b/cpp/src/community/detail/maximal_independent_moves_mg.cu
new file mode 100644
index 00000000000..577253cdf58
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves_mg.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "maximal_independent_moves.cuh"
+
+namespace cugraph {
+namespace detail {
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(  // int32 vertices, int32 edges
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(  // int32 vertices, int64 edges
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> maximal_independent_moves(  // int64 vertices, int64 edges
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/maximal_independent_moves_sg.cu b/cpp/src/community/detail/maximal_independent_moves_sg.cu
new file mode 100644
index 00000000000..18527c1ce48
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves_sg.cu
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "maximal_independent_moves.cuh"
+
+namespace cugraph {
+namespace detail {
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(  // int32 vertices, int32 edges
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(  // int32 vertices, int64 edges
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> maximal_independent_moves(  // int64 vertices, int64 edges
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index b767ce7d8bb..ef34ad90584 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -16,8 +16,8 @@
 #pragma once
 
 #include "common_methods.hpp"
-#include "community/mis.hpp"
 #include "detail/graph_partition_utils.cuh"
+#include "maximal_independent_moves.hpp"
 #include "prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh"
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
 #include "prims/reduce_op.cuh"
@@ -660,8 +660,8 @@ refine_clustering(
     // Determine a set of moves using MIS of the decision_graph
     //
 
-    auto vertices_in_mis =
-      maximal_independent_set<vertex_t, edge_t, multi_gpu>(handle, decision_graph_view, rng_state);
+    auto vertices_in_mis = maximal_independent_moves<vertex_t, edge_t, multi_gpu>(
+      handle, decision_graph_view, rng_state);
 
     rmm::device_uvector<vertex_t> numbering_indices((*renumber_map).size(), handle.get_stream());
     detail::sequence_fill(handle.get_stream(),
diff --git a/cpp/src/community/legacy/ecg.cu b/cpp/src/community/legacy/ecg.cu
index d93a4446faa..b2ad79204ed 100644
--- a/cpp/src/community/legacy/ecg.cu
+++ b/cpp/src/community/legacy/ecg.cu
@@ -52,7 +52,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT
 // FIXME: This shouldn't need to be a custom kernel, this
 //        seems like it should just be a thrust::transform
 template <typename IdxT, typename ValT>
-__global__ void match_check_kernel(
+__global__ static void match_check_kernel(
   IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights)
 {
   IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/cpp/src/components/legacy/weak_cc.cuh b/cpp/src/components/legacy/weak_cc.cuh
index ad9aa773590..f4254e2d55d 100644
--- a/cpp/src/components/legacy/weak_cc.cuh
+++ b/cpp/src/components/legacy/weak_cc.cuh
@@ -59,15 +59,15 @@ class WeakCCState {
 };
 
 template <typename vertex_t, typename edge_t, int TPB_X = 32>
-__global__ void weak_cc_label_device(vertex_t* labels,
-                                     edge_t const* offsets,
-                                     vertex_t const* indices,
-                                     edge_t nnz,
-                                     bool* fa,
-                                     bool* xa,
-                                     bool* m,
-                                     vertex_t startVertexId,
-                                     vertex_t batchSize)
+__global__ static void weak_cc_label_device(vertex_t* labels,
+                                            edge_t const* offsets,
+                                            vertex_t const* indices,
+                                            edge_t nnz,
+                                            bool* fa,
+                                            bool* xa,
+                                            bool* m,
+                                            vertex_t startVertexId,
+                                            vertex_t batchSize)
 {
   vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < batchSize) {
@@ -118,11 +118,11 @@ __global__ void weak_cc_label_device(vertex_t* labels,
 }
 
 template <typename vertex_t, int TPB_X = 32, typename Lambda>
-__global__ void weak_cc_init_label_kernel(vertex_t* labels,
-                                          vertex_t startVertexId,
-                                          vertex_t batchSize,
-                                          vertex_t MAX_LABEL,
-                                          Lambda filter_op)
+__global__ static void weak_cc_init_label_kernel(vertex_t* labels,
+                                                 vertex_t startVertexId,
+                                                 vertex_t batchSize,
+                                                 vertex_t MAX_LABEL,
+                                                 Lambda filter_op)
 {
   /** F1 and F2 in the paper correspond to fa and xa */
   /** Cd in paper corresponds to db_cluster */
@@ -134,7 +134,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels,
 }
 
 template <typename vertex_t, int TPB_X = 32>
-__global__ void weak_cc_init_all_kernel(
+__global__ static void weak_cc_init_all_kernel(
   vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL)
 {
   vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
similarity index 90%
rename from cpp/src/community/detail/mis_impl.cuh
rename to cpp/src/components/mis_impl.cuh
index 217d64eb1c1..550edf9807a 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -16,11 +16,12 @@
  */
 #pragma once
 
-#include "community/mis.hpp"
+#include "prims/fill_edge_property.cuh"
 #include "prims/fill_edge_src_dst_property.cuh"
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 
+#include <cugraph/algorithms.hpp>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
@@ -60,36 +61,46 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
     thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
   auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
 
-  // Compute out-degree
   auto out_degrees = graph_view.compute_out_degrees(handle);
+  auto in_degrees  = graph_view.compute_in_degrees(handle);
 
-  // Vertices with non-zero out-degree are possible candidates for MIS.
+  // Vertices with degree zero are always part of MIS; only the others go into remaining_vertices
   remaining_vertices.resize(
     thrust::distance(remaining_vertices.begin(),
                      thrust::copy_if(handle.get_thrust_policy(),
                                      vertex_begin,
                                      vertex_end,
-                                     out_degrees.begin(),
+                                     thrust::make_zip_iterator(
+                                       thrust::make_tuple(out_degrees.begin(), in_degrees.begin())),
                                      remaining_vertices.begin(),
-                                     [] __device__(auto deg) { return deg > 0; })),
+                                     [] __device__(auto out_deg_and_in_deg) {
+                                       return !((thrust::get<0>(out_deg_and_in_deg) == 0) &&
+                                                (thrust::get<1>(out_deg_and_in_deg) == 0));
+                                     })),
     handle.get_stream());
 
   // Set ID of each vertex as its rank
   rmm::device_uvector<vertex_t> ranks(local_vtx_partitoin_size, handle.get_stream());
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
-  // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(handle.get_thrust_policy(),
-                       out_degrees.begin(),
-                       out_degrees.end(),
-                       ranks.begin(),
-                       cuda::proclaim_return_type<vertex_t>(
-                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
-                       [] __device__(auto deg) { return deg == 0; });
+  // Set ranks of zero degree vertices to std::numeric_limits<vertex_t>::max()
+  thrust::transform_if(
+    handle.get_thrust_policy(),
+    thrust::make_zip_iterator(thrust::make_tuple(out_degrees.begin(), in_degrees.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(out_degrees.end(), in_degrees.end())),
+    ranks.begin(),
+    cuda::proclaim_return_type<vertex_t>(
+      [] __device__(auto) { return std::numeric_limits<vertex_t>::max(); }),
+    [] __device__(auto in_out_degree) {
+      return (thrust::get<0>(in_out_degree) == 0) && (thrust::get<1>(in_out_degree) == 0);
+    });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
 
+  in_degrees.resize(0, handle.get_stream());
+  in_degrees.shrink_to_fit(handle.get_stream());
+
   size_t loop_counter = 0;
   while (true) {
     loop_counter++;
diff --git a/cpp/src/community/detail/mis_mg.cu b/cpp/src/components/mis_mg.cu
similarity index 83%
rename from cpp/src/community/detail/mis_mg.cu
rename to cpp/src/components/mis_mg.cu
index 0fc5eeb63c0..2418b38dd0b 100644
--- a/cpp/src/community/detail/mis_mg.cu
+++ b/cpp/src/components/mis_mg.cu
@@ -14,21 +14,21 @@
  * limitations under the License.
  */
 #include "mis_impl.cuh"
-
 namespace cugraph {
+
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/mis_sg.cu b/cpp/src/components/mis_sg.cu
similarity index 82%
rename from cpp/src/community/detail/mis_sg.cu
rename to cpp/src/components/mis_sg.cu
index 8a20b31d157..fea4c4f2765 100644
--- a/cpp/src/community/detail/mis_sg.cu
+++ b/cpp/src/components/mis_sg.cu
@@ -16,19 +16,20 @@
 #include "mis_impl.cuh"
 
 namespace cugraph {
+
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 }  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_impl.cuh b/cpp/src/components/vertex_coloring_impl.cuh
new file mode 100644
index 00000000000..ce445ab3809
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_impl.cuh
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/fill_edge_property.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/algorithms.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/random/rng_state.hpp>
+
+namespace cugraph {
+
+namespace detail {
+
+/**
+ * Colors vertices by repeatedly extracting a maximal independent set (MIS).
+ *
+ * Self-loops are masked out first. Each round calls cugraph::maximal_independent_set() on the
+ * masked view, assigns every returned vertex min(its current color, round index), and masks out
+ * all edges with an endpoint in that MIS; it stops when compute_number_of_edges() returns 0.
+ *
+ * @param handle     RAFT handle supplying the stream and thrust policy used below.
+ * @param graph_view Graph to color; copied internally so masks are attached to the copy only.
+ * @param rng_state  Random state forwarded to every maximal_independent_set() call.
+ * @return One color id per vertex of the local vertex partition.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  using graph_view_t = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+  using flag_t       = uint8_t;
+  graph_view_t current_graph_view(graph_view);
+
+  // Two mask buffers are used alternately: each round computes the next mask into the one that
+  // is not attached to the view.
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_even(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_odd(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+
+  cugraph::transform_e(
+    handle,
+    current_graph_view,
+    edge_src_dummy_property_t{}.view(),
+    edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+      return !(src == dst);  // mask out self-loop
+    },
+    edge_masks_even.mutable_view());
+
+  current_graph_view.attach_edge_mask(edge_masks_even.view());
+
+  // Every vertex starts with the sentinel color std::numeric_limits<vertex_t>::max().
+  rmm::device_uvector<vertex_t> colors(current_graph_view.local_vertex_partition_range_size(),
+                                       handle.get_stream());
+  thrust::fill(
+    handle.get_thrust_policy(), colors.begin(), colors.end(), std::numeric_limits<vertex_t>::max());
+
+  vertex_t color_id = 0;
+  while (true) {
+    auto mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      handle, current_graph_view, rng_state);
+
+    rmm::device_uvector<flag_t> is_vertex_in_mis(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::fill(handle.get_thrust_policy(), is_vertex_in_mis.begin(), is_vertex_in_mis.end(), 0);
+
+    // Flag the MIS members and give each the smaller of its current color and this round's id.
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      mis.begin(),
+      mis.end(),
+      [color_id,
+       colors           = colors.data(),
+       is_vertex_in_mis = is_vertex_in_mis.data(),
+       v_first = current_graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
+        auto v_offset              = v - v_first;
+        is_vertex_in_mis[v_offset] = flag_t{1};
+        vertex_t initial_color_id  = colors[v_offset];
+        colors[v_offset]           = (color_id < initial_color_id) ? color_id : initial_color_id;
+      });
+
+    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
+
+    cugraph::edge_src_property_t<graph_view_t, flag_t> src_mis_flags(handle, current_graph_view);
+    cugraph::edge_dst_property_t<graph_view_t, flag_t> dst_mis_flags(handle, current_graph_view);
+
+    cugraph::update_edge_src_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags);
+
+    cugraph::update_edge_dst_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags);
+
+    // Even rounds fill the odd buffer and odd rounds the even one; an edge survives only when
+    // neither endpoint is in this round's MIS.
+    bool const is_even_round = (color_id % 2 == 0);
+    auto& next_edge_masks    = is_even_round ? edge_masks_odd : edge_masks_even;
+    auto& stale_edge_masks   = is_even_round ? edge_masks_even : edge_masks_odd;
+    cugraph::transform_e(
+      handle,
+      current_graph_view,
+      src_mis_flags.view(),
+      dst_mis_flags.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
+        return !((is_src_in_mis == flag_t{1}) || (is_dst_in_mis == flag_t{1}));
+      },
+      next_edge_masks.mutable_view());
+
+    if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+    cugraph::fill_edge_property(handle, current_graph_view, bool{false}, stale_edge_masks);
+    current_graph_view.attach_edge_mask(next_edge_masks.view());
+
+    color_id++;
+  }
+  return colors;
+}
+}  // namespace detail
+// Entry point in namespace cugraph; delegates to detail::vertex_coloring() with the same arguments.
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  return detail::vertex_coloring(handle, graph_view, rng_state);
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_mg.cu b/cpp/src/components/vertex_coloring_mg.cu
new file mode 100644
index 00000000000..8f87e8bd534
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_mg.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "vertex_coloring_impl.cuh"
+
+namespace cugraph {
+// Explicit instantiations of vertex_coloring for multi-GPU graph views (multi_gpu = true).
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_sg.cu b/cpp/src/components/vertex_coloring_sg.cu
new file mode 100644
index 00000000000..427bc0b2c81
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_sg.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "vertex_coloring_impl.cuh"
+
+namespace cugraph {
+// Explicit instantiations of vertex_coloring for single-GPU graph views (multi_gpu = false).
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace cugraph
diff --git a/cpp/src/layout/legacy/bh_kernels.cuh b/cpp/src/layout/legacy/bh_kernels.cuh
index 5b101363314..f6e163ab306 100644
--- a/cpp/src/layout/legacy/bh_kernels.cuh
+++ b/cpp/src/layout/legacy/bh_kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,9 +42,9 @@ namespace detail {
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */
-__global__ void InitializationKernel(unsigned* restrict limiter,
-                                     int* restrict maxdepthd,
-                                     float* restrict radiusd)
+__global__ static void InitializationKernel(unsigned* restrict limiter,
+                                            int* restrict maxdepthd,
+                                            float* restrict radiusd)
 {
   maxdepthd[0] = 1;
   limiter[0]   = 0;
@@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter,
 /**
  * Reset root.
  */
-__global__ void ResetKernel(float* restrict radiusd_squared,
-                            int* restrict bottomd,
-                            const int NNODES,
-                            const float* restrict radiusd)
+__global__ static void ResetKernel(float* restrict radiusd_squared,
+                                   int* restrict bottomd,
+                                   const int NNODES,
+                                   const float* restrict radiusd)
 {
   radiusd_squared[0] = radiusd[0] * radiusd[0];
   // create root node
@@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared,
 /**
  * Figures the bounding boxes for every point in the embedding.
  */
-__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd,
-                                                                       int* restrict childd,
-                                                                       int* restrict massd,
-                                                                       float* restrict posxd,
-                                                                       float* restrict posyd,
-                                                                       float* restrict maxxd,
-                                                                       float* restrict maxyd,
-                                                                       float* restrict minxd,
-                                                                       float* restrict minyd,
-                                                                       const int FOUR_NNODES,
-                                                                       const int NNODES,
-                                                                       const int N,
-                                                                       unsigned* restrict limiter,
-                                                                       float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS1,
+                                    FACTOR1) void BoundingBoxKernel(int* restrict startd,
+                                                                    int* restrict childd,
+                                                                    int* restrict massd,
+                                                                    float* restrict posxd,
+                                                                    float* restrict posyd,
+                                                                    float* restrict maxxd,
+                                                                    float* restrict maxyd,
+                                                                    float* restrict minxd,
+                                                                    float* restrict minyd,
+                                                                    const int FOUR_NNODES,
+                                                                    const int NNODES,
+                                                                    const int N,
+                                                                    unsigned* restrict limiter,
+                                                                    float* restrict radiusd)
 {
   float val, minx, maxx, miny, maxy;
   __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1];
@@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest
 /**
  * Clear some of the state vectors up.
  */
-__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
-                                                        const int FOUR_NNODES,
-                                                        const int FOUR_N)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
+                                                               const int FOUR_NNODES,
+                                                               const int FOUR_N)
 {
   const int inc = blockDim.x * gridDim.x;
   int k         = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x;
@@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
 /**
  * Build the actual KD Tree.
  */
-__global__ __launch_bounds__(THREADS2,
-                             FACTOR2) void TreeBuildingKernel(int* restrict childd,
-                                                              const float* restrict posxd,
-                                                              const float* restrict posyd,
-                                                              const int NNODES,
-                                                              const int N,
-                                                              int* restrict maxdepthd,
-                                                              int* restrict bottomd,
-                                                              const float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS2,
+                                    FACTOR2) void TreeBuildingKernel(int* restrict childd,
+                                                                     const float* restrict posxd,
+                                                                     const float* restrict posyd,
+                                                                     const int NNODES,
+                                                                     const int N,
+                                                                     int* restrict maxdepthd,
+                                                                     int* restrict bottomd,
+                                                                     const float* restrict radiusd)
 {
   int j, depth;
   float x, y, r;
@@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2,
 /**
  * Clean more state vectors.
  */
-__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
-                                                        int* restrict massd,
-                                                        const int NNODES,
-                                                        const int* restrict bottomd)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
+                                                               int* restrict massd,
+                                                               const int NNODES,
+                                                               const int* restrict bottomd)
 {
   const int bottom = bottomd[0];
   const int inc    = blockDim.x * gridDim.x;
@@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
 /**
  * Summarize the KD Tree via cell gathering
  */
-__global__ __launch_bounds__(THREADS3,
-                             FACTOR3) void SummarizationKernel(int* restrict countd,
-                                                               const int* restrict childd,
-                                                               volatile int* restrict massd,
-                                                               float* restrict posxd,
-                                                               float* restrict posyd,
-                                                               const int NNODES,
-                                                               const int N,
-                                                               const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS3,
+                                    FACTOR3) void SummarizationKernel(int* restrict countd,
+                                                                      const int* restrict childd,
+                                                                      volatile int* restrict massd,
+                                                                      float* restrict posxd,
+                                                                      float* restrict posyd,
+                                                                      const int NNODES,
+                                                                      const int N,
+                                                                      const int* restrict bottomd)
 {
   bool flag = 0;
   float cm, px, py;
@@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3,
 /**
  * Sort the cells
  */
-__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd,
-                                                                const int* restrict countd,
-                                                                volatile int* restrict startd,
-                                                                int* restrict childd,
-                                                                const int NNODES,
-                                                                const int N,
-                                                                const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS4,
+                                    FACTOR4) void SortKernel(int* restrict sortd,
+                                                             const int* restrict countd,
+                                                             volatile int* restrict startd,
+                                                             int* restrict childd,
+                                                             const int NNODES,
+                                                             const int N,
+                                                             const int* restrict bottomd)
 {
   const int bottom = bottomd[0];
   const int dec    = blockDim.x * gridDim.x;
@@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so
 /**
  * Calculate the repulsive forces using the KD Tree
  */
-__global__ __launch_bounds__(
+__global__ static __launch_bounds__(
   THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */
                                           const float scaling_ratio,
                                           const float theta,
@@ -612,18 +614,18 @@ __global__ __launch_bounds__(
   }
 }
 
-__global__ __launch_bounds__(THREADS6,
-                             FACTOR6) void apply_forces_bh(float* restrict Y_x,
-                                                           float* restrict Y_y,
-                                                           const float* restrict attract_x,
-                                                           const float* restrict attract_y,
-                                                           const float* restrict repel_x,
-                                                           const float* restrict repel_y,
-                                                           float* restrict old_dx,
-                                                           float* restrict old_dy,
-                                                           const float* restrict swinging,
-                                                           const float speed,
-                                                           const int n)
+__global__ static __launch_bounds__(THREADS6,
+                                    FACTOR6) void apply_forces_bh(float* restrict Y_x,
+                                                                  float* restrict Y_y,
+                                                                  const float* restrict attract_x,
+                                                                  const float* restrict attract_y,
+                                                                  const float* restrict repel_x,
+                                                                  const float* restrict repel_y,
+                                                                  float* restrict old_dx,
+                                                                  float* restrict old_dy,
+                                                                  const float* restrict swinging,
+                                                                  const float speed,
+                                                                  const int n)
 {
   // For evrery vertex
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
diff --git a/cpp/src/layout/legacy/exact_repulsion.cuh b/cpp/src/layout/legacy/exact_repulsion.cuh
index fe895bae6a0..8530202afd5 100644
--- a/cpp/src/layout/legacy/exact_repulsion.cuh
+++ b/cpp/src/layout/legacy/exact_repulsion.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,13 +22,13 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t>
-__global__ void repulsion_kernel(const float* restrict x_pos,
-                                 const float* restrict y_pos,
-                                 float* restrict repel_x,
-                                 float* restrict repel_y,
-                                 const int* restrict mass,
-                                 const float scaling_ratio,
-                                 const vertex_t n)
+__global__ static void repulsion_kernel(const float* restrict x_pos,
+                                        const float* restrict y_pos,
+                                        float* restrict repel_x,
+                                        float* restrict repel_y,
+                                        const int* restrict mass,
+                                        const float scaling_ratio,
+                                        const vertex_t n)
 {
   int j = (blockIdx.x * blockDim.x) + threadIdx.x;  // for every item in row
   int i = (blockIdx.y * blockDim.y) + threadIdx.y;  // for every row
diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh
index 4f1ce520387..33e7841a380 100644
--- a/cpp/src/layout/legacy/fa2_kernels.cuh
+++ b/cpp/src/layout/legacy/fa2_kernels.cuh
@@ -23,19 +23,19 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-__global__ void attraction_kernel(const vertex_t* restrict row,
-                                  const vertex_t* restrict col,
-                                  const weight_t* restrict v,
-                                  const edge_t e,
-                                  const float* restrict x_pos,
-                                  const float* restrict y_pos,
-                                  float* restrict attract_x,
-                                  float* restrict attract_y,
-                                  const int* restrict mass,
-                                  bool outbound_attraction_distribution,
-                                  bool lin_log_mode,
-                                  const float edge_weight_influence,
-                                  const float coef)
+__global__ static void attraction_kernel(const vertex_t* restrict row,
+                                         const vertex_t* restrict col,
+                                         const weight_t* restrict v,
+                                         const edge_t e,
+                                         const float* restrict x_pos,
+                                         const float* restrict y_pos,
+                                         float* restrict attract_x,
+                                         float* restrict attract_y,
+                                         const int* restrict mass,
+                                         bool outbound_attraction_distribution,
+                                         bool lin_log_mode,
+                                         const float edge_weight_influence,
+                                         const float coef)
 {
   vertex_t i, src, dst;
   weight_t weight = 1;
@@ -116,13 +116,13 @@ void apply_attraction(const vertex_t* restrict row,
 }
 
 template <typename vertex_t>
-__global__ void linear_gravity_kernel(const float* restrict x_pos,
-                                      const float* restrict y_pos,
-                                      float* restrict attract_x,
-                                      float* restrict attract_y,
-                                      const int* restrict mass,
-                                      const float gravity,
-                                      const vertex_t n)
+__global__ static void linear_gravity_kernel(const float* restrict x_pos,
+                                             const float* restrict y_pos,
+                                             float* restrict attract_x,
+                                             float* restrict attract_y,
+                                             const int* restrict mass,
+                                             const float gravity,
+                                             const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -136,14 +136,14 @@ __global__ void linear_gravity_kernel(const float* restrict x_pos,
 }
 
 template <typename vertex_t>
-__global__ void strong_gravity_kernel(const float* restrict x_pos,
-                                      const float* restrict y_pos,
-                                      float* restrict attract_x,
-                                      float* restrict attract_y,
-                                      const int* restrict mass,
-                                      const float gravity,
-                                      const float scaling_ratio,
-                                      const vertex_t n)
+__global__ static void strong_gravity_kernel(const float* restrict x_pos,
+                                             const float* restrict y_pos,
+                                             float* restrict attract_x,
+                                             float* restrict attract_y,
+                                             const int* restrict mass,
+                                             const float gravity,
+                                             const float scaling_ratio,
+                                             const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -187,16 +187,16 @@ void apply_gravity(const float* restrict x_pos,
 }
 
 template <typename vertex_t>
-__global__ void local_speed_kernel(const float* restrict repel_x,
-                                   const float* restrict repel_y,
-                                   const float* restrict attract_x,
-                                   const float* restrict attract_y,
-                                   const float* restrict old_dx,
-                                   const float* restrict old_dy,
-                                   const int* restrict mass,
-                                   float* restrict swinging,
-                                   float* restrict traction,
-                                   const vertex_t n)
+__global__ static void local_speed_kernel(const float* restrict repel_x,
+                                          const float* restrict repel_y,
+                                          const float* restrict attract_x,
+                                          const float* restrict attract_y,
+                                          const float* restrict old_dx,
+                                          const float* restrict old_dy,
+                                          const int* restrict mass,
+                                          float* restrict swinging,
+                                          float* restrict traction,
+                                          const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -272,17 +272,17 @@ void adapt_speed(const float jitter_tolerance,
 }
 
 template <typename vertex_t>
-__global__ void update_positions_kernel(float* restrict x_pos,
-                                        float* restrict y_pos,
-                                        const float* restrict repel_x,
-                                        const float* restrict repel_y,
-                                        const float* restrict attract_x,
-                                        const float* restrict attract_y,
-                                        float* restrict old_dx,
-                                        float* restrict old_dy,
-                                        const float* restrict swinging,
-                                        const float speed,
-                                        const vertex_t n)
+__global__ static void update_positions_kernel(float* restrict x_pos,
+                                               float* restrict y_pos,
+                                               const float* restrict repel_x,
+                                               const float* restrict repel_y,
+                                               const float* restrict attract_x,
+                                               const float* restrict attract_y,
+                                               float* restrict old_dx,
+                                               float* restrict old_dy,
+                                               const float* restrict swinging,
+                                               const float speed,
+                                               const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index fc3da3cac07..0b6447f50d9 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -127,7 +127,7 @@ template <bool hypersparse,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree(
+__global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -295,7 +295,7 @@ template <typename GraphViewType,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_mid_degree(
+__global__ static void extract_transform_v_frontier_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -396,7 +396,7 @@ template <typename GraphViewType,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_high_degree(
+__global__ static void extract_transform_v_frontier_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index e0a04eb59da..847c1db6937 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -50,6 +50,7 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/optional.h>
+#include <thrust/reduce.h>
 #include <thrust/remove.h>
 #include <thrust/scan.h>
 #include <thrust/set_operations.h>
@@ -1232,9 +1233,11 @@ nbr_intersection(raft::handle_t const& handle,
           rx_v_pair_nbr_intersection_sizes.size() + 1, handle.get_stream());
         rx_v_pair_nbr_intersection_offsets.set_element_to_zero_async(size_t{0},
                                                                      handle.get_stream());
+        auto size_first = thrust::make_transform_iterator(
+          rx_v_pair_nbr_intersection_sizes.begin(), cugraph::detail::typecast_t<edge_t, size_t>{});
         thrust::inclusive_scan(handle.get_thrust_policy(),
-                               rx_v_pair_nbr_intersection_sizes.begin(),
-                               rx_v_pair_nbr_intersection_sizes.end(),
+                               size_first,
+                               size_first + rx_v_pair_nbr_intersection_sizes.size(),
                                rx_v_pair_nbr_intersection_offsets.begin() + 1);
 
         rx_v_pair_nbr_intersection_indices.resize(
@@ -1344,8 +1347,8 @@ nbr_intersection(raft::handle_t const& handle,
         }
 
         thrust::inclusive_scan(handle.get_thrust_policy(),
-                               rx_v_pair_nbr_intersection_sizes.begin(),
-                               rx_v_pair_nbr_intersection_sizes.end(),
+                               size_first,
+                               size_first + rx_v_pair_nbr_intersection_sizes.size(),
                                rx_v_pair_nbr_intersection_offsets.begin() + 1);
 
         std::vector<size_t> h_rx_v_pair_lasts(rx_v_pair_counts.size());
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 9cb3365116e..5240c49cb80 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -328,7 +328,7 @@ struct return_value_compute_offset_t {
 };
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
+__global__ static void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
   raft::device_span<vertex_t const> edge_partition_frontier_majors,
@@ -382,7 +382,7 @@ __global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void compute_valid_local_nbr_inclusive_sums_high_local_degree(
+__global__ static void compute_valid_local_nbr_inclusive_sums_high_local_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
   raft::device_span<vertex_t const> edge_partition_frontier_majors,
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index 3b25ae50773..5e4cd81513e 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "detail/graph_partition_utils.cuh"
+#include "prims/detail/optional_dataframe_buffer.hpp"
 #include "prims/kv_store.cuh"
 #include "utilities/collect_comm.cuh"
 
@@ -83,15 +84,23 @@ struct rebase_offset_t {
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename edge_value_t>
-struct triplet_to_minor_comm_rank_t {
+struct tuple_to_minor_comm_rank_t {
   compute_vertex_partition_id_from_ext_vertex_t<vertex_t> key_func{};
   int minor_comm_size{};
 
-  __device__ int operator()(
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<!std::is_same_v<edge_value_type, thrust::nullopt_t>, int> operator()(
     thrust::tuple<vertex_t, vertex_t, edge_value_t> val /* major, minor key, edge value */) const
   {
     return key_func(thrust::get<1>(val)) % minor_comm_size;
   }
+
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<std::is_same_v<edge_value_type, thrust::nullopt_t>, int> operator()(
+    thrust::tuple<vertex_t, vertex_t> val /* major, minor key */) const
+  {
+    return key_func(thrust::get<1>(val)) % minor_comm_size;
+  }
 };
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
@@ -106,6 +115,7 @@ struct pair_to_binary_partition_id_t {
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t,
           typename edge_value_t,
+          typename e_op_result_t,
           typename EdgePartitionDeviceView,
           typename EdgeMajorValueMap,
           typename EdgePartitionMajorValueInputWrapper,
@@ -118,8 +128,10 @@ struct call_key_aggregated_e_op_t {
   EdgeMinorKeyValueMap edge_minor_key_value_map{};
   KeyAggregatedEdgeOp key_aggregated_e_op{};
 
-  __device__ auto operator()(thrust::tuple<vertex_t, vertex_t, edge_value_t>
-                               val /* major, minor key, aggregated edge value */) const
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<!std::is_same_v<edge_value_type, thrust::nullopt_t>, e_op_result_t>
+  operator()(thrust::tuple<vertex_t, vertex_t, edge_value_t>
+               val /* major, minor key, aggregated edge value */) const
   {
     auto major                 = thrust::get<0>(val);
     auto minor_key             = thrust::get<1>(val);
@@ -131,6 +143,20 @@ struct call_key_aggregated_e_op_t {
     return key_aggregated_e_op(
       major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), aggregated_edge_value);
   }
+
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<std::is_same_v<edge_value_type, thrust::nullopt_t>, e_op_result_t>
+  operator()(thrust::tuple<vertex_t, vertex_t> val /* major, minor key */) const
+  {
+    auto major     = thrust::get<0>(val);
+    auto minor_key = thrust::get<1>(val);
+    auto major_val = edge_major_value_map
+                       ? (*edge_major_value_map).find(major)
+                       : edge_partition_major_value_input.get(
+                           edge_partition.major_offset_from_major_nocheck(major));
+    return key_aggregated_e_op(
+      major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), thrust::nullopt);
+  }
 };
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
@@ -182,9 +208,8 @@ struct reduce_with_init_t {
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
  * @tparam EdgeDstKeyInputWrapper Type of the wrapper for edge destination key values.
- * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
- * coincide with vertex type).
- * @tparam ValueIterator Type of the iterator for values in (key, value) pairs.
+ * @tparam KVStoreViewType Type of the (key, value) store view. Key type should coincide with
+ * vertex type.
  * @tparam KeyAggregatedEdgeOp Type of the quinary key-aggregated edge operator.
  * @tparam ReduceOp Type of the binary reduction operator.
  * @tparam T Type of the initial value for per-vertex reduction.
@@ -204,15 +229,10 @@ struct reduce_with_init_t {
  * @param edge_dst_key_input Wrapper used to access destination input key values (for the edge
  * destinations assigned to this process in multi-GPU). Use  cugraph::edge_dst_property_t::view().
  * Use update_edge_dst_property to fill the wrapper.
- * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is
- * used to map keys to processes). (Key, value) pairs may be provided by
- * transform_reduce_by_src_key_e() or transform_reduce_by_dst_key_e().
- * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU).
- * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs
- * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p
- * map_value_first + thrust::distance(@p map_unique_key_first, @p map_unique_key_last).
+ * @param kv_store_view view object of the (key, value) store (for the keys assigned to this process
+ * in multi-GPU; `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is used to map keys to
+ * processes). (Key, value) pairs may be provided by transform_reduce_e_by_src_key() or
+ * transform_reduce_e_by_dst_key().
  * @param key_aggregated_e_op Quinary operator takes 1) edge source, 2) key, 3) *(@p
  * edge_partition_src_value_input_first + i), 4) value for the key stored in the input (key, value)
  * pairs provided by @p map_unique_key_first, @p map_unique_key_last, and @p map_value_first
@@ -263,8 +283,11 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
   using edge_src_value_t = typename EdgeSrcValueInputWrapper::value_type;
   using edge_value_t     = typename EdgeValueInputWrapper::value_type;
   using kv_pair_value_t  = typename KVStoreViewType::value_type;
+  using optional_edge_value_buffer_value_type =
+    std::conditional_t<!std::is_same_v<edge_value_t, thrust::nullopt_t>, edge_value_t, void>;
+
   static_assert(
-    std::is_arithmetic_v<edge_value_t>,
+    std::is_same_v<edge_value_t, thrust::nullopt_t> || std::is_arithmetic_v<edge_value_t>,
     "Currently only scalar values are supported, should be extended to support thrust::tuple of "
     "arithmetic types and void (for dummy property values) to be consistent with other "
     "primitives.");  // this will also require a custom edge value aggregation op.
@@ -284,16 +307,15 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
-
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
 
   if (do_expensive_check) { /* currently, nothing to do */
   }
 
   auto total_global_mem = handle.get_device_properties().totalGlobalMem;
   size_t element_size   = sizeof(vertex_t) * 2;  // major + minor keys
-  if constexpr (!std::is_same_v<edge_value_t, void>) {
+  if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
     static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<edge_value_t>::value);
     if constexpr (is_thrust_tuple_of_arithmetic<edge_value_t>::value) {
       element_size += sum_thrust_tuple_element_sizes<edge_value_t>();
@@ -317,24 +339,78 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
 
   // 1. aggregate each vertex out-going edges based on keys and transform-reduce.
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   rmm::device_uvector<vertex_t> majors(0, handle.get_stream());
   auto e_op_result_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     auto edge_partition_src_value_input =
       edge_partition_src_input_device_view_t(edge_src_value_input, i);
     auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i);
 
-    rmm::device_uvector<vertex_t> tmp_majors(edge_partition.number_of_edges(), handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> offsets_with_mask{std::nullopt};
+    if (edge_partition_e_mask) {
+      rmm::device_uvector<edge_t> degrees_with_mask(0, handle.get_stream());
+      if (edge_partition.dcs_nzd_vertices()) {
+        auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
+
+        auto major_sparse_range_size =
+          (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition];
+        degrees_with_mask = rmm::device_uvector<edge_t>(
+          major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream());
+        auto major_first = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(vertex_t{0}),
+          cuda::proclaim_return_type<vertex_t>(
+            [major_sparse_range_size,
+             major_range_first = edge_partition.major_range_first(),
+             dcs_nzd_vertices  = *(edge_partition.dcs_nzd_vertices())] __device__(vertex_t i) {
+              if (i < major_sparse_range_size) {  // sparse
+                return major_range_first + i;
+              } else {  // hypersparse
+                return *(dcs_nzd_vertices + (i - major_sparse_range_size));
+              }
+            }));
+        degrees_with_mask =
+          edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(),
+                                                         major_first,
+                                                         major_first + degrees_with_mask.size(),
+                                                         handle.get_stream());
+      } else {
+        degrees_with_mask = edge_partition.compute_local_degrees_with_mask(
+          (*edge_partition_e_mask).value_first(),
+          thrust::make_counting_iterator(edge_partition.major_range_first()),
+          thrust::make_counting_iterator(edge_partition.major_range_last()),
+          handle.get_stream());
+      }
+      offsets_with_mask =
+        rmm::device_uvector<edge_t>(degrees_with_mask.size() + 1, handle.get_stream());
+      (*offsets_with_mask).set_element_to_zero_async(0, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             degrees_with_mask.begin(),
+                             degrees_with_mask.end(),
+                             (*offsets_with_mask).begin() + 1);
+    }
+
+    rmm::device_uvector<vertex_t> tmp_majors(
+      edge_partition_e_mask ? (*offsets_with_mask).back_element(handle.get_stream())
+                            : edge_partition.number_of_edges(),
+      handle.get_stream());
     rmm::device_uvector<vertex_t> tmp_minor_keys(tmp_majors.size(), handle.get_stream());
-    // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-    rmm::device_uvector<edge_value_t> tmp_key_aggregated_edge_values(tmp_majors.size(),
-                                                                     handle.get_stream());
+    auto tmp_key_aggregated_edge_values =
+      detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+        tmp_majors.size(), handle.get_stream());
 
-    if (edge_partition.number_of_edges() > 0) {
+    if (tmp_majors.size() > 0) {
       auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
 
       detail::decompress_edge_partition_to_fill_edgelist_majors<vertex_t,
@@ -342,7 +418,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                                                                 GraphViewType::is_multi_gpu>(
         handle,
         edge_partition,
-        std::nullopt,
+        detail::to_std_optional(edge_partition_e_mask),
         raft::device_span<vertex_t>(tmp_majors.data(), tmp_majors.size()),
         segment_offsets);
 
@@ -357,14 +433,14 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 20);
       auto [h_vertex_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks(
         handle,
-        raft::device_span<edge_t const>{
-          edge_partition.offsets(),
-          1 + static_cast<size_t>(
-                edge_partition.dcs_nzd_vertices()
-                  ? (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition] +
-                      *(edge_partition.dcs_nzd_vertex_count())
-                  : edge_partition.major_range_size())},
-        edge_partition.number_of_edges(),
+        raft::device_span<edge_t const>(
+          offsets_with_mask ? (*offsets_with_mask).data() : edge_partition.offsets(),
+          (edge_partition.dcs_nzd_vertices()
+             ? (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition] +
+                 *(edge_partition.dcs_nzd_vertex_count())
+             : edge_partition.major_range_size()) +
+            1),
+        static_cast<edge_t>(tmp_majors.size()),
         approx_edges_to_sort_per_iteration);
       auto num_chunks = h_vertex_offsets.size() - 1;
 
@@ -376,30 +452,69 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       rmm::device_uvector<vertex_t> unreduced_majors(max_chunk_size, handle.get_stream());
       rmm::device_uvector<vertex_t> unreduced_minor_keys(unreduced_majors.size(),
                                                          handle.get_stream());
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      rmm::device_uvector<edge_value_t> unreduced_key_aggregated_edge_values(
-        unreduced_majors.size(), handle.get_stream());
+      auto unreduced_key_aggregated_edge_values =
+        detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          unreduced_majors.size(), handle.get_stream());
       rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
 
       size_t reduced_size{0};
       for (size_t j = 0; j < num_chunks; ++j) {
-        thrust::copy(handle.get_thrust_policy(),
-                     minor_key_first + h_edge_offsets[j],
-                     minor_key_first + h_edge_offsets[j + 1],
-                     tmp_minor_keys.begin() + h_edge_offsets[j]);
+        if (edge_partition_e_mask) {
+          std::array<edge_t, 2> unmasked_ranges{};
+          raft::update_host(unmasked_ranges.data(),
+                            edge_partition.offsets() + h_vertex_offsets[j],
+                            1,
+                            handle.get_stream());
+          raft::update_host(unmasked_ranges.data() + 1,
+                            edge_partition.offsets() + h_vertex_offsets[j + 1],
+                            1,
+                            handle.get_stream());
+          handle.sync_stream();
+          if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+            detail::copy_if_mask_set(
+              handle,
+              thrust::make_zip_iterator(minor_key_first,
+                                        edge_partition_e_value_input.value_first()) +
+                unmasked_ranges[0],
+              thrust::make_zip_iterator(minor_key_first,
+                                        edge_partition_e_value_input.value_first()) +
+                unmasked_ranges[1],
+              (*edge_partition_e_mask).value_first() + unmasked_ranges[0],
+              thrust::make_zip_iterator(tmp_minor_keys.begin(),
+                                        detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                          tmp_key_aggregated_edge_values)) +
+                h_edge_offsets[j]);
+          } else {
+            detail::copy_if_mask_set(handle,
+                                     minor_key_first + unmasked_ranges[0],
+                                     minor_key_first + unmasked_ranges[1],
+                                     (*edge_partition_e_mask).value_first() + unmasked_ranges[0],
+                                     tmp_minor_keys.begin() + h_edge_offsets[j]);
+          }
+        } else {
+          thrust::copy(handle.get_thrust_policy(),
+                       minor_key_first + h_edge_offsets[j],
+                       minor_key_first + h_edge_offsets[j + 1],
+                       tmp_minor_keys.begin() + h_edge_offsets[j]);
+        }
 
         size_t tmp_storage_bytes{0};
-        auto offset_first =
-          thrust::make_transform_iterator(edge_partition.offsets() + h_vertex_offsets[j],
-                                          detail::rebase_offset_t<edge_t>{h_edge_offsets[j]});
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+        auto offset_first = thrust::make_transform_iterator(
+          (offsets_with_mask ? (*offsets_with_mask).data() : edge_partition.offsets()) +
+            h_vertex_offsets[j],
+          detail::rebase_offset_t<edge_t>{h_edge_offsets[j]});
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           cub::DeviceSegmentedSort::SortPairs(
             static_cast<void*>(nullptr),
             tmp_storage_bytes,
             tmp_minor_keys.begin() + h_edge_offsets[j],
             unreduced_minor_keys.begin(),
-            edge_partition_e_value_input.value_first() + h_edge_offsets[j],
-            unreduced_key_aggregated_edge_values.begin(),
+            (edge_partition_e_mask ? detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                       tmp_key_aggregated_edge_values)
+                                   : edge_partition_e_value_input.value_first()) +
+              h_edge_offsets[j],
+            detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+              unreduced_key_aggregated_edge_values),
             h_edge_offsets[j + 1] - h_edge_offsets[j],
             h_vertex_offsets[j + 1] - h_vertex_offsets[j],
             offset_first,
@@ -419,14 +534,18 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         if (tmp_storage_bytes > d_tmp_storage.size()) {
           d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
         }
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           cub::DeviceSegmentedSort::SortPairs(
             d_tmp_storage.data(),
             tmp_storage_bytes,
             tmp_minor_keys.begin() + h_edge_offsets[j],
             unreduced_minor_keys.begin(),
-            edge_partition_e_value_input.value_first() + h_edge_offsets[j],
-            unreduced_key_aggregated_edge_values.begin(),
+            (edge_partition_e_mask ? detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                       tmp_key_aggregated_edge_values)
+                                   : edge_partition_e_value_input.value_first()) +
+              h_edge_offsets[j],
+            detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+              unreduced_key_aggregated_edge_values),
             h_edge_offsets[j + 1] - h_edge_offsets[j],
             h_vertex_offsets[j + 1] - h_vertex_offsets[j],
             offset_first,
@@ -448,39 +567,44 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                      tmp_majors.begin() + h_edge_offsets[j],
                      tmp_majors.begin() + h_edge_offsets[j + 1],
                      unreduced_majors.begin());
-        auto input_key_first = thrust::make_zip_iterator(
-          thrust::make_tuple(unreduced_majors.begin(), unreduced_minor_keys.begin()));
+        auto input_key_first =
+          thrust::make_zip_iterator(unreduced_majors.begin(), unreduced_minor_keys.begin());
         auto output_key_first =
-          thrust::make_zip_iterator(thrust::make_tuple(tmp_majors.begin(), tmp_minor_keys.begin()));
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+          thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           reduced_size +=
             thrust::distance(output_key_first + reduced_size,
                              thrust::get<0>(thrust::reduce_by_key(
                                handle.get_thrust_policy(),
                                input_key_first,
                                input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
-                               unreduced_key_aggregated_edge_values.begin(),
+                               detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                 unreduced_key_aggregated_edge_values),
                                output_key_first + reduced_size,
-                               tmp_key_aggregated_edge_values.begin() + reduced_size)));
+                               detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                 tmp_key_aggregated_edge_values) +
+                                 reduced_size)));
         } else {
-          reduced_size +=
-            thrust::distance(output_key_first + reduced_size,
-                             thrust::get<0>(thrust::unique(
-                               handle.get_thrust_policy(),
-                               input_key_first,
-                               input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
-                               output_key_first + reduced_size)));
+          reduced_size += thrust::distance(
+            output_key_first + reduced_size,
+            thrust::copy_if(
+              handle.get_thrust_policy(),
+              input_key_first,
+              input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
+              thrust::make_counting_iterator(size_t{0}),
+              output_key_first + reduced_size,
+              cugraph::detail::is_first_in_run_t<decltype(input_key_first)>{input_key_first}));
         }
       }
       tmp_majors.resize(reduced_size, handle.get_stream());
       tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      tmp_key_aggregated_edge_values.resize(tmp_majors.size(), handle.get_stream());
+      detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+        tmp_key_aggregated_edge_values, tmp_majors.size(), handle.get_stream());
     }
     tmp_majors.shrink_to_fit(handle.get_stream());
     tmp_minor_keys.shrink_to_fit(handle.get_stream());
-    // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-    tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+    detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, handle.get_stream());
 
     std::unique_ptr<
       kv_store_t<vertex_t,
@@ -499,18 +623,34 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
       auto const minor_comm_size = minor_comm.get_size();
 
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      auto triplet_first     = thrust::make_zip_iterator(thrust::make_tuple(
-        tmp_majors.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_values.begin()));
-      auto d_tx_value_counts = cugraph::groupby_and_count(
-        triplet_first,
-        triplet_first + tmp_majors.size(),
-        detail::triplet_to_minor_comm_rank_t<vertex_t, edge_value_t>{
-          detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
-          minor_comm_size},
-        minor_comm_size,
-        mem_frugal_threshold,
-        handle.get_stream());
+      rmm::device_uvector<size_t> d_tx_value_counts(0, handle.get_stream());
+      if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+        auto triplet_first =
+          thrust::make_zip_iterator(tmp_majors.begin(),
+                                    tmp_minor_keys.begin(),
+                                    detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                      tmp_key_aggregated_edge_values));
+        d_tx_value_counts = cugraph::groupby_and_count(
+          triplet_first,
+          triplet_first + tmp_majors.size(),
+          detail::tuple_to_minor_comm_rank_t<vertex_t, edge_value_t>{
+            detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
+            minor_comm_size},
+          minor_comm_size,
+          mem_frugal_threshold,
+          handle.get_stream());
+      } else {
+        auto pair_first   = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+        d_tx_value_counts = cugraph::groupby_and_count(
+          pair_first,
+          pair_first + tmp_majors.size(),
+          detail::tuple_to_minor_comm_rank_t<vertex_t, edge_value_t>{
+            detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
+            minor_comm_size},
+          minor_comm_size,
+          mem_frugal_threshold,
+          handle.get_stream());
+      }
 
       std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
       raft::update_host(h_tx_value_counts.data(),
@@ -544,8 +684,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
           thrust::copy(
             handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end(), majors.begin());
 
-          auto pair_first =
-            thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(), majors.begin()));
+          auto pair_first = thrust::make_zip_iterator(minor_comm_ranks.begin(), majors.begin());
           thrust::sort(
             handle.get_thrust_policy(), pair_first, pair_first + minor_comm_ranks.size());
           auto unique_pair_last = thrust::unique(
@@ -622,7 +761,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
 
       rmm::device_uvector<vertex_t> rx_majors(0, handle.get_stream());
       rmm::device_uvector<vertex_t> rx_minor_keys(0, handle.get_stream());
-      rmm::device_uvector<edge_value_t> rx_key_aggregated_edge_values(0, handle.get_stream());
+      auto rx_key_aggregated_edge_values =
+        detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          0, handle.get_stream());
       auto mem_frugal_flag =
         host_scalar_allreduce(minor_comm,
                               tmp_majors.size() > mem_frugal_threshold ? int{1} : int{0},
@@ -639,66 +780,120 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         tmp_minor_keys.resize(0, handle.get_stream());
         tmp_minor_keys.shrink_to_fit(handle.get_stream());
 
-        std::tie(rx_key_aggregated_edge_values, std::ignore) =
-          shuffle_values(minor_comm,
-                         tmp_key_aggregated_edge_values.begin(),
-                         h_tx_value_counts,
-                         handle.get_stream());
-        tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-        tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+          std::tie(rx_key_aggregated_edge_values, std::ignore) =
+            shuffle_values(minor_comm,
+                           detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                             tmp_key_aggregated_edge_values),
+                           h_tx_value_counts,
+                           handle.get_stream());
+        }
+        detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, 0, handle.get_stream());
+        detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, handle.get_stream());
       } else {
-        std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys, rx_key_aggregated_edge_values),
-                              std::ignore) =
-          shuffle_values(minor_comm, triplet_first, h_tx_value_counts, handle.get_stream());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+          auto triplet_first =
+            thrust::make_zip_iterator(tmp_majors.begin(),
+                                      tmp_minor_keys.begin(),
+                                      detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                        tmp_key_aggregated_edge_values));
+          std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys, rx_key_aggregated_edge_values),
+                                std::ignore) =
+            shuffle_values(minor_comm, triplet_first, h_tx_value_counts, handle.get_stream());
+        } else {
+          auto pair_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+          std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys), std::ignore) =
+            shuffle_values(minor_comm, pair_first, h_tx_value_counts, handle.get_stream());
+        }
         tmp_majors.resize(0, handle.get_stream());
         tmp_majors.shrink_to_fit(handle.get_stream());
         tmp_minor_keys.resize(0, handle.get_stream());
         tmp_minor_keys.shrink_to_fit(handle.get_stream());
-        tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-        tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+        detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, 0, handle.get_stream());
+        detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, handle.get_stream());
       }
 
-      auto key_pair_first =
-        thrust::make_zip_iterator(thrust::make_tuple(rx_majors.begin(), rx_minor_keys.begin()));
-      if (rx_majors.size() > mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
-        auto second_first =
-          detail::mem_frugal_partition(key_pair_first,
-                                       key_pair_first + rx_majors.size(),
-                                       rx_key_aggregated_edge_values.begin(),
-                                       detail::pair_to_binary_partition_id_t<vertex_t>{},
-                                       int{1},
-                                       handle.get_stream());
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            key_pair_first,
-                            std::get<0>(second_first),
-                            rx_key_aggregated_edge_values.begin());
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            std::get<0>(second_first),
-                            key_pair_first + rx_majors.size(),
-                            std::get<1>(second_first));
+      auto key_pair_first = thrust::make_zip_iterator(rx_majors.begin(), rx_minor_keys.begin());
+      if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+        if (rx_majors.size() >
+            mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
+          auto second_first =
+            detail::mem_frugal_partition(key_pair_first,
+                                         key_pair_first + rx_majors.size(),
+                                         detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                           rx_key_aggregated_edge_values),
+                                         detail::pair_to_binary_partition_id_t<vertex_t>{},
+                                         int{1},
+                                         handle.get_stream());
+
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              key_pair_first,
+                              std::get<0>(second_first),
+                              detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                rx_key_aggregated_edge_values));
+
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              std::get<0>(second_first),
+                              key_pair_first + rx_majors.size(),
+                              std::get<1>(second_first));
+        } else {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              key_pair_first,
+                              key_pair_first + rx_majors.size(),
+                              detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                rx_key_aggregated_edge_values));
+        }
+
+        auto num_uniques =
+          thrust::count_if(handle.get_thrust_policy(),
+                           thrust::make_counting_iterator(size_t{0}),
+                           thrust::make_counting_iterator(rx_majors.size()),
+                           detail::is_first_in_run_t<decltype(key_pair_first)>{key_pair_first});
+        tmp_majors.resize(num_uniques, handle.get_stream());
+        tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
+        detail::resize_optional_dataframe_buffer<edge_value_t>(
+          tmp_key_aggregated_edge_values, tmp_majors.size(), handle.get_stream());
+        thrust::reduce_by_key(
+          handle.get_thrust_policy(),
+          key_pair_first,
+          key_pair_first + rx_majors.size(),
+          detail::get_optional_dataframe_buffer_begin<edge_value_t>(rx_key_aggregated_edge_values),
+          thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin()),
+          detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+            tmp_key_aggregated_edge_values));
       } else {
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            key_pair_first,
-                            key_pair_first + rx_majors.size(),
-                            rx_key_aggregated_edge_values.begin());
+        if (rx_majors.size() >
+            mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
+          auto second_first =
+            detail::mem_frugal_partition(key_pair_first,
+                                         key_pair_first + rx_majors.size(),
+                                         detail::pair_to_binary_partition_id_t<vertex_t>{},
+                                         int{1},
+                                         handle.get_stream());
+
+          thrust::sort(handle.get_thrust_policy(), key_pair_first, second_first);
+
+          thrust::sort(handle.get_thrust_policy(), second_first, key_pair_first + rx_majors.size());
+        } else {
+          thrust::sort(
+            handle.get_thrust_policy(), key_pair_first, key_pair_first + rx_majors.size());
+        }
+
+        auto num_uniques = thrust::distance(
+          key_pair_first,
+          thrust::unique(
+            handle.get_thrust_policy(), key_pair_first, key_pair_first + rx_majors.size()));
+        tmp_majors.resize(num_uniques, handle.get_stream());
+        tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
+        thrust::copy(handle.get_thrust_policy(),
+                     key_pair_first,
+                     key_pair_first + num_uniques,
+                     thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin()));
       }
-      auto num_uniques =
-        thrust::count_if(handle.get_thrust_policy(),
-                         thrust::make_counting_iterator(size_t{0}),
-                         thrust::make_counting_iterator(rx_majors.size()),
-                         detail::is_first_in_run_t<decltype(key_pair_first)>{key_pair_first});
-      tmp_majors.resize(num_uniques, handle.get_stream());
-      tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
-      tmp_key_aggregated_edge_values.resize(tmp_majors.size(), handle.get_stream());
-      thrust::reduce_by_key(
-        handle.get_thrust_policy(),
-        key_pair_first,
-        key_pair_first + rx_majors.size(),
-        rx_key_aggregated_edge_values.begin(),
-        thrust::make_zip_iterator(thrust::make_tuple(tmp_majors.begin(), tmp_minor_keys.begin())),
-        tmp_key_aggregated_edge_values.begin());
     }
 
     std::unique_ptr<kv_store_t<vertex_t, kv_pair_value_t, KVStoreViewType::binary_search>>
@@ -756,8 +951,6 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
     auto tmp_e_op_result_buffer =
       allocate_dataframe_buffer<T>(tmp_majors.size(), handle.get_stream());
 
-    auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
-      tmp_majors.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_values.begin()));
     auto major_value_map_device_view =
       (GraphViewType::is_multi_gpu && edge_src_value_input.keys())
         ? thrust::make_optional<detail::kv_binary_search_store_device_view_t<
@@ -768,28 +961,56 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                        detail::kv_cuco_store_find_device_view_t<KVStoreViewType>>
       dst_key_value_map_device_view(
         GraphViewType::is_multi_gpu ? multi_gpu_minor_key_value_map_ptr->view() : kv_store_view);
-    thrust::transform(handle.get_thrust_policy(),
-                      triplet_first,
-                      triplet_first + tmp_majors.size(),
-                      get_dataframe_buffer_begin(tmp_e_op_result_buffer),
-                      detail::call_key_aggregated_e_op_t<
-                        vertex_t,
-                        edge_value_t,
-                        decltype(edge_partition),
-                        std::remove_reference_t<decltype(*major_value_map_device_view)>,
-                        edge_partition_src_input_device_view_t,
-                        decltype(dst_key_value_map_device_view),
-                        KeyAggregatedEdgeOp>{edge_partition,
-                                             major_value_map_device_view,
-                                             edge_partition_src_value_input,
-                                             dst_key_value_map_device_view,
-                                             key_aggregated_e_op});
+    if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+      auto triplet_first = thrust::make_zip_iterator(
+        tmp_majors.begin(),
+        tmp_minor_keys.begin(),
+        detail::get_optional_dataframe_buffer_begin<edge_value_t>(tmp_key_aggregated_edge_values));
+      thrust::transform(handle.get_thrust_policy(),
+                        triplet_first,
+                        triplet_first + tmp_majors.size(),
+                        get_dataframe_buffer_begin(tmp_e_op_result_buffer),
+                        detail::call_key_aggregated_e_op_t<
+                          vertex_t,
+                          edge_value_t,
+                          T,
+                          decltype(edge_partition),
+                          std::remove_reference_t<decltype(*major_value_map_device_view)>,
+                          edge_partition_src_input_device_view_t,
+                          decltype(dst_key_value_map_device_view),
+                          KeyAggregatedEdgeOp>{edge_partition,
+                                               major_value_map_device_view,
+                                               edge_partition_src_value_input,
+                                               dst_key_value_map_device_view,
+                                               key_aggregated_e_op});
+    } else {
+      auto pair_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+      thrust::transform(handle.get_thrust_policy(),
+                        pair_first,
+                        pair_first + tmp_majors.size(),
+                        get_dataframe_buffer_begin(tmp_e_op_result_buffer),
+                        detail::call_key_aggregated_e_op_t<
+                          vertex_t,
+                          edge_value_t,
+                          T,
+                          decltype(edge_partition),
+                          std::remove_reference_t<decltype(*major_value_map_device_view)>,
+                          edge_partition_src_input_device_view_t,
+                          decltype(dst_key_value_map_device_view),
+                          KeyAggregatedEdgeOp>{edge_partition,
+                                               major_value_map_device_view,
+                                               edge_partition_src_value_input,
+                                               dst_key_value_map_device_view,
+                                               key_aggregated_e_op});
+    }
 
     if constexpr (GraphViewType::is_multi_gpu) { multi_gpu_minor_key_value_map_ptr.reset(); }
     tmp_minor_keys.resize(0, handle.get_stream());
     tmp_minor_keys.shrink_to_fit(handle.get_stream());
-    tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-    tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+    detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, 0, handle.get_stream());
+    detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, handle.get_stream());
 
     {
       auto num_uniques =
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 083487fa5b4..509ab56d3fe 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -149,7 +149,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_hypersparse(
+__global__ static void per_v_transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -251,7 +251,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_low_degree(
+__global__ static void per_v_transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -350,7 +350,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_mid_degree(
+__global__ static void per_v_transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -466,7 +466,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_high_degree(
+__global__ static void per_v_transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index 2cb1a5358b0..9c7670f68d2 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -51,7 +51,7 @@ template <bool check_edge_mask,
           typename EdgePartitionEdgeMaskWrapper,
           typename EdgePartitionEdgeValueOutputWrapper,
           typename EdgeOp>
-__global__ void transform_e_packed_bool(
+__global__ static void transform_e_packed_bool(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
index b63b014ed05..244586e6d9e 100644
--- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
+++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
@@ -19,6 +19,7 @@
 #include "prims/detail/nbr_intersection.cuh"
 #include "prims/property_op_utils.cuh"
 
+#include <cugraph/detail/decompress_edge_partition.cuh>
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
@@ -130,7 +131,9 @@ std::tuple<rmm::device_uvector<vertex_t>, ValueBuffer> sort_and_reduce_by_vertic
                         vertices.end(),
                         get_dataframe_buffer_begin(value_buffer),
                         reduced_vertices.begin(),
-                        get_dataframe_buffer_begin(reduced_value_buffer));
+                        get_dataframe_buffer_begin(reduced_value_buffer),
+                        thrust::equal_to<vertex_t>{},
+                        property_op<value_t, thrust::plus>{});
 
   vertices.resize(size_t{0}, handle.get_stream());
   resize_dataframe_buffer(value_buffer, size_t{0}, handle.get_stream());
@@ -201,14 +204,14 @@ struct accumulate_vertex_property_t {
  * @param graph_view Non-owning graph object.
  * @param edge_src_value_input Wrapper used to access source input property values (for the edge
  * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
- * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
- * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
- * wrapper.
+ * (if @p intersection_op needs to access source property values) or
+ * cugraph::edge_src_dummy_property_t::view() (if @p intersection_op does not access source property
+ * values). Use update_edge_src_property to fill the wrapper.
  * @param edge_dst_value_input Wrapper used to access destination input property values (for the
  * edge destinations assigned to this process in multi-GPU). Use either
- * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
- * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
- * values). Use update_edge_dst_property to fill the wrapper.
+ * cugraph::edge_dst_property_t::view() (if @p intersection_op needs to access destination property
+ * values) or cugraph::edge_dst_dummy_property_t::view() (if @p intersection_op does not access
+ * destination property values). Use update_edge_dst_property to fill the wrapper.
  * @param intersection_op quinary operator takes edge source, edge destination, property values for
  * the source, property values for the destination, and a list of vertices in the intersection of
  * edge source & destination vertices' destination neighbors and returns a thrust::tuple of three
@@ -260,8 +263,6 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
       typename EdgeDstValueInputWrapper::value_iterator,
       typename EdgeDstValueInputWrapper::value_type>>;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do.
   }
@@ -272,6 +273,7 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
                init);
 
   auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
@@ -484,7 +486,9 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
                             merged_vertices.end(),
                             get_dataframe_buffer_begin(merged_value_buffer),
                             reduced_vertices.begin(),
-                            get_dataframe_buffer_begin(reduced_value_buffer));
+                            get_dataframe_buffer_begin(reduced_value_buffer),
+                            thrust::equal_to<vertex_t>{},
+                            property_op<T, thrust::plus>{});
       merged_vertices.resize(size_t{0}, handle.get_stream());
       merged_vertices.shrink_to_fit(handle.get_stream());
       resize_dataframe_buffer(merged_value_buffer, size_t{0}, handle.get_stream());
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index e5855b105ee..43722550c58 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -61,7 +61,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_hypersparse(
+__global__ static void transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -153,7 +153,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_low_degree(
+__global__ static void transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -242,7 +242,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_mid_degree(
+__global__ static void transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -320,7 +320,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_high_degree(
+__global__ static void transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 42203085077..00876012906 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -95,9 +95,10 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_hypersparse(
+__global__ static void transform_reduce_by_src_dst_key_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -105,6 +106,9 @@ __global__ void transform_reduce_by_src_dst_key_hypersparse(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -129,19 +133,42 @@ __global__ void transform_reduce_by_src_dst_key_hypersparse(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_idx));
-    auto local_offset = edge_partition.local_offset(major_idx);
-    for (edge_t i = 0; i < local_degree; ++i) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      auto major_offset          = edge_partition.major_offset_from_major_nocheck(major);
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = 0; i < local_degree; ++i) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = 0; i < local_degree; ++i) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * blockDim.x;
@@ -154,9 +181,10 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_low_degree(
+__global__ static void transform_reduce_by_src_dst_key_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -166,6 +194,9 @@ __global__ void transform_reduce_by_src_dst_key_low_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -187,19 +218,41 @@ __global__ void transform_reduce_by_src_dst_key_low_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = 0; i < local_degree; ++i) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = 0; i < local_degree; ++i) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = 0; i < local_degree; ++i) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * blockDim.x;
@@ -212,9 +265,10 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_mid_degree(
+__global__ static void transform_reduce_by_src_dst_key_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -224,6 +278,9 @@ __global__ void transform_reduce_by_src_dst_key_mid_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -238,6 +295,9 @@ __global__ void transform_reduce_by_src_dst_key_mid_degree(
     static_cast<size_t>(major_range_first - edge_partition.major_range_first());
   size_t idx = static_cast<size_t>(tid / raft::warp_size());
 
+  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
+  __shared__ typename WarpScan::TempStorage temp_storage;
+
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
     auto major_offset = major_start_offset + idx;
     auto major =
@@ -247,19 +307,49 @@ __global__ void transform_reduce_by_src_dst_key_mid_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      // FIXME: it might be faster to update in a warp-synchronous way
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; }
+      }
+      edge_t offset_within_warp{};
+      WarpScan(temp_storage).ExclusiveSum(counter, offset_within_warp);
+      edge_offset_with_mask += offset_within_warp;
+      counter = 0;
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * (blockDim.x / raft::warp_size());
@@ -272,9 +362,10 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_high_degree(
+__global__ static void transform_reduce_by_src_dst_key_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -284,6 +375,9 @@ __global__ void transform_reduce_by_src_dst_key_high_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -295,6 +389,9 @@ __global__ void transform_reduce_by_src_dst_key_high_degree(
     static_cast<size_t>(major_range_first - edge_partition.major_range_first());
   auto idx = static_cast<size_t>(blockIdx.x);
 
+  using BlockScan = cub::BlockScan<edge_t, transform_reduce_e_by_src_dst_key_kernel_block_size>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
     auto major_offset = major_start_offset + idx;
     auto major =
@@ -304,19 +401,49 @@ __global__ void transform_reduce_by_src_dst_key_high_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      // FIXME: it might be faster to update in a block-synchronous way
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; }
+      }
+      edge_t offset_within_block{};
+      BlockScan(temp_storage).ExclusiveSum(counter, offset_within_block);
+      edge_offset_with_mask += offset_within_block;
+      counter = 0;
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x;
@@ -410,19 +537,41 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
       typename EdgeSrcDstKeyInputWrapper::value_iterator,
       typename EdgeSrcDstKeyInputWrapper::value_type>;
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   rmm::device_uvector<vertex_t> keys(0, handle.get_stream());
   auto value_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
-
-    auto num_edges = edge_partition.number_of_edges();
-
-    rmm::device_uvector<vertex_t> tmp_keys(num_edges, handle.get_stream());
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
+    rmm::device_uvector<vertex_t> tmp_keys(0, handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> edge_offsets_with_mask{std::nullopt};
+    if (edge_partition_e_mask) {
+      auto local_degrees = edge_partition.compute_local_degrees_with_mask(
+        (*edge_partition_e_mask).value_first(), handle.get_stream());
+      edge_offsets_with_mask =
+        rmm::device_uvector<edge_t>(edge_partition.major_range_size() + 1, handle.get_stream());
+      (*edge_offsets_with_mask).set_element_to_zero_async(0, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             local_degrees.begin(),
+                             local_degrees.end(),
+                             (*edge_offsets_with_mask).begin() + 1);
+      tmp_keys.resize((*edge_offsets_with_mask).back_element(handle.get_stream()),
+                      handle.get_stream());
+    } else {
+      tmp_keys.resize(edge_partition.number_of_edges(), handle.get_stream());
+    }
     auto tmp_value_buffer = allocate_dataframe_buffer<T>(tmp_keys.size(), handle.get_stream());
 
-    if (num_edges > 0) {
+    if (tmp_keys.size() > 0) {
       edge_partition_src_input_device_view_t edge_partition_src_value_input{};
       edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
       if constexpr (GraphViewType::is_storage_transposed) {
@@ -467,6 +616,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -485,6 +639,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -503,6 +662,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -520,6 +684,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -539,6 +708,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
             edge_partition_src_dst_key_input,
+            edge_partition_e_mask,
+            edge_offsets_with_mask
+              ? thrust::make_optional<raft::device_span<edge_t const>>(
+                  (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+              : thrust::nullopt,
             e_op,
             tmp_keys.data(),
             get_dataframe_buffer_begin(tmp_value_buffer));
@@ -682,8 +856,6 @@ auto transform_reduce_e_by_src_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -772,8 +944,6 @@ auto transform_reduce_e_by_dst_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 4ee5ad5ca02..29dca6ef409 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -241,7 +241,7 @@ rmm::device_uvector<edge_t> compute_minor_degrees(
 int32_t constexpr count_edge_partition_multi_edges_block_size = 1024;
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void for_all_major_for_all_nbr_mid_degree(
+__global__ static void for_all_major_for_all_nbr_mid_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
@@ -275,7 +275,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void for_all_major_for_all_nbr_high_degree(
+__global__ static void for_all_major_for_all_nbr_high_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh
index c2a3f1160ca..612eb0c48f2 100644
--- a/cpp/src/traversal/od_shortest_distances_impl.cuh
+++ b/cpp/src/traversal/od_shortest_distances_impl.cuh
@@ -215,7 +215,7 @@ template <int32_t max_num_partitions,
           typename key_t,
           typename PartitionOp,
           typename KeyOp>
-__global__ void multi_partition_copy(
+__global__ static void multi_partition_copy(
   InputIterator input_first,
   InputIterator input_last,
   raft::device_span<key_t*> output_buffer_ptrs,
diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp
index 84240ba2845..abf026cbbfe 100644
--- a/cpp/src/utilities/eidecl_graph_utils.hpp
+++ b/cpp/src/utilities/eidecl_graph_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,9 +29,12 @@ extern template void offsets_to_indices<int, int>(int const*, int, int*);
 extern template void offsets_to_indices<long, int>(long const*, int, int*);
 extern template void offsets_to_indices<long, long>(long const*, long, long*);
 
-extern template __global__ void offsets_to_indices_kernel<int, int>(int const*, int, int*);
-extern template __global__ void offsets_to_indices_kernel<long, int>(long const*, int, int*);
-extern template __global__ void offsets_to_indices_kernel<long, long>(long const*, long, long*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<int, int>(int const*, int, int*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<long, int>(long const*, int, int*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<long, long>(long const*, long, long*);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp
index 033bb197ce8..ba06c6f56ea 100644
--- a/cpp/src/utilities/eidir_graph_utils.hpp
+++ b/cpp/src/utilities/eidir_graph_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,15 +29,12 @@ template void offsets_to_indices<int32_t, int32_t>(int32_t const*, int32_t, int3
 template void offsets_to_indices<int64_t, int32_t>(int64_t const*, int32_t, int32_t*);
 template void offsets_to_indices<int64_t, int64_t>(int64_t const*, int64_t, int64_t*);
 
-template __global__ void offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*,
-                                                                     int32_t,
-                                                                     int32_t*);
-template __global__ void offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*,
-                                                                     int32_t,
-                                                                     int32_t*);
-template __global__ void offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*,
-                                                                     int64_t,
-                                                                     int64_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*, int32_t, int32_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*, int32_t, int32_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*, int64_t, int64_t*);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
index 2d542956531..0b257e7abde 100644
--- a/cpp/src/utilities/graph_utils.cuh
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -247,34 +247,36 @@ void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor)
 
 // google matrix kernels
 template <typename IndexType, typename ValueType>
-__global__ void degree_coo(const IndexType n,
-                           const IndexType e,
-                           const IndexType* ind,
-                           ValueType* degree)
+__global__ static void degree_coo(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* ind,
+                                  ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
     atomicAdd(&degree[ind[i]], (ValueType)1.0);
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark)
+__global__ static void flag_leafs_kernel(const size_t n,
+                                         const IndexType* degree,
+                                         ValueType* bookmark)
 {
   for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     if (degree[i] == 0) bookmark[i] = 1.0;
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void degree_offsets(const IndexType n,
-                               const IndexType e,
-                               const IndexType* ind,
-                               ValueType* degree)
+__global__ static void degree_offsets(const IndexType n,
+                                      const IndexType e,
+                                      const IndexType* ind,
+                                      ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     degree[i] += ind[i + 1] - ind[i];
 }
 
 template <typename FromType, typename ToType>
-__global__ void type_convert(FromType* array, int n)
+__global__ static void type_convert(FromType* array, int n)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
     ToType val   = array[i];
@@ -284,12 +286,12 @@ __global__ void type_convert(FromType* array, int n)
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob3(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob3(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int j, row, col;
   for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) {
@@ -303,12 +305,12 @@ __global__ void equi_prob3(const IndexType n,
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob2(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob2(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int row = blockIdx.x * blockDim.x + threadIdx.x;
   if (row < n) {
@@ -372,7 +374,8 @@ void HT_matrix_csc_coo(const IndexType n,
 }
 
 template <typename offsets_t, typename index_t>
-__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices)
+__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel(
+  const offsets_t* offsets, index_t v, index_t* indices)
 {
   auto tid{threadIdx.x};
   auto ctaStart{blockIdx.x};
diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu
index e37ce3a3ced..eda60941c23 100644
--- a/cpp/src/utilities/path_retrieval.cu
+++ b/cpp/src/utilities/path_retrieval.cu
@@ -29,13 +29,13 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename weight_t>
-__global__ void get_traversed_cost_kernel(vertex_t const* vertices,
-                                          vertex_t const* preds,
-                                          vertex_t const* vtx_map,
-                                          weight_t const* info_weights,
-                                          weight_t* out,
-                                          vertex_t stop_vertex,
-                                          vertex_t num_vertices)
+__global__ static void get_traversed_cost_kernel(vertex_t const* vertices,
+                                                 vertex_t const* preds,
+                                                 vertex_t const* vtx_map,
+                                                 weight_t const* info_weights,
+                                                 weight_t* out,
+                                                 vertex_t stop_vertex,
+                                                 vertex_t num_vertices)
 {
   for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices;
        i += gridDim.x * blockDim.x) {
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/utilities/shuffle_vertex_pairs.cu
similarity index 80%
rename from cpp/src/detail/shuffle_vertex_pairs.cu
rename to cpp/src/utilities/shuffle_vertex_pairs.cu
index 33a7834f5ff..b473796aa9d 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/utilities/shuffle_vertex_pairs.cu
@@ -519,4 +519,106 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   std::vector<int64_t> const& vertex_partition_range_lasts);
 
 }  // namespace detail
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types)
+{
+  auto& comm                 = handle.get_comms();
+  auto const comm_size       = comm.get_size();
+  auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+  auto const major_comm_size = major_comm.get_size();
+  auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+  auto const minor_comm_size = minor_comm.get_size();
+
+  return detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
+    handle,
+    std::move(edge_srcs),
+    std::move(edge_dsts),
+    std::move(edge_weights),
+    std::move(edge_ids),
+    std::move(edge_types));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/utilities/shuffle_vertices.cu
similarity index 73%
rename from cpp/src/detail/shuffle_vertices.cu
rename to cpp/src/utilities/shuffle_vertices.cu
index be6875f1073..b396201f509 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/utilities/shuffle_vertices.cu
@@ -249,4 +249,74 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<double>&& values);
 
 }  // namespace detail
+
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values)
+{
+  return detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+    handle, std::move(vertices), std::move(values));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int64_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
+
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices)
+{
+  return detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle,
+                                                                          std::move(vertices));
+}
+
+template rmm::device_uvector<int32_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int32_t>&& d_vertices);
+
+template rmm::device_uvector<int64_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int64_t>&& d_vertices);
+
 }  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 46a895536ef..4d37c93326d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -372,6 +372,14 @@ ConfigureTest(EDGE_BETWEENNESS_CENTRALITY_TEST centrality/edge_betweenness_centr
 # - WEAKLY CONNECTED COMPONENTS tests -------------------------------------------------------------
 ConfigureTest(WEAKLY_CONNECTED_COMPONENTS_TEST components/weakly_connected_components_test.cpp)
 
+###############################################################################################
+# - MIS tests ------------------------------------------------------------------------------
+ConfigureTest(MIS_TEST components/mis_test.cu)
+
+###############################################################################################
+# - VERTEX COLORING tests -------------------------------------------------------------------
+ConfigureTest(VERTEX_COLORING_TEST components/vertex_coloring_test.cu)
+
 ###################################################################################################
 # - SIMILARITY tests ------------------------------------------------------------------------------
 ConfigureTest(SIMILARITY_TEST link_prediction/similarity_test.cu)
@@ -388,12 +396,10 @@ ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 ###################################################################################################
 # - NBR SAMPLING tests ----------------------------------------------------------------------------
 ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampling.cu)
-target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
 ###################################################################################################
 # - SAMPLING_POST_PROCESSING tests ----------------------------------------------------------------
 ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu)
-target_link_libraries(SAMPLING_POST_PROCESSING_TEST PRIVATE cuco::cuco)
 
 ###################################################################################################
 # - Renumber tests --------------------------------------------------------------------------------
@@ -535,10 +541,6 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG ECG tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_ECG_TEST community/mg_ecg_test.cpp)
 
-    ###############################################################################################
-    # - MG MIS tests ------------------------------------------------------------------------------
-    ConfigureTestMG(MG_MIS_TEST community/mg_mis_test.cu)
-
     ###############################################################################################
     # - MG SELECT RANDOM VERTICES tests -----------------------------------------------------------
     ConfigureTestMG(MG_SELECT_RANDOM_VERTICES structure/mg_select_random_vertices_test.cpp)
@@ -552,6 +554,14 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST
                     components/mg_weakly_connected_components_test.cpp)
 
+    ###############################################################################################
+    # - MG MIS tests ------------------------------------------------------------------------------
+    ConfigureTestMG(MG_MIS_TEST components/mg_mis_test.cu)
+
+    ###############################################################################################
+    # - MG VERTEX COLORING tests -------------------------------------------------------------------
+    ConfigureTestMG(MG_VERTEX_COLORING_TEST components/mg_vertex_coloring_test.cu)
+
     ###############################################################################################
     # - MG Core Number tests ----------------------------------------------------------------------
     ConfigureTestMG(MG_CORE_NUMBER_TEST cores/mg_core_number_test.cpp)
@@ -571,78 +581,79 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ###############################################################################################
     # - MG PRIMS COUNT_IF_V tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu)
-    target_link_libraries(MG_COUNT_IF_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST
                     prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS REDUCE_V tests -------------------------------------------------------------------
     ConfigureTestMG(MG_REDUCE_V_TEST prims/mg_reduce_v.cu)
-    target_link_libraries(MG_REDUCE_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_V tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_TEST prims/mg_transform_reduce_v.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_E tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_E_TEST prims/mg_transform_reduce_e.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_E_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_E_BY_SRC_DST_KEY tests ------------------------------------------
+    ConfigureTestMG(MG_TRANSFORM_REDUCE_E_BY_SRC_DST_KEY_TEST
+                    prims/mg_transform_reduce_e_by_src_dst_key.cu)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_E tests ----------------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_E_TEST prims/mg_transform_e.cu)
-    target_link_libraries(MG_TRANSFORM_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS COUNT_IF_E tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_E_TEST prims/mg_count_if_e.cu)
-    target_link_libraries(MG_COUNT_IF_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E tests ---------------------------------
     ConfigureTestMG(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST
       prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu)
-    target_link_libraries(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS PER_V_TRANSFORM_REDUCE_DST_KEY_AGGREGATED_OUTGOING_E tests -----------------------
+    ConfigureTestMG(MG_PER_V_TRANSFORM_REDUCE_DST_KEY_AGGREGATED_OUTGOING_E_TEST
+      prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu)
 
     ###############################################################################################
     # - MG PRIMS EXTRACT_TRANSFORM_E tests --------------------------------------------------------
     ConfigureTestMG(MG_EXTRACT_TRANSFORM_E_TEST prims/mg_extract_transform_e.cu)
-    target_link_libraries(MG_EXTRACT_TRANSFORM_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E tests ------------------------------------
     ConfigureTestMG(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST
                     prims/mg_extract_transform_v_frontier_outgoing_e.cu)
-    target_link_libraries(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E tests -----------------------------------
     ConfigureTestMG(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST
                     prims/mg_per_v_random_select_transform_outgoing_e.cu)
-    target_link_libraries(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION tests ----------------------------------
     ConfigureTestMG(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST
                     prims/mg_per_v_pair_transform_dst_nbr_intersection.cu)
-    target_link_libraries(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_DST_NBR_INTERSECTION_OF_E_ENDPOINTS_BY_V tests ------------------
+    ConfigureTestMG(MG_TRANSFORM_REDUCE_DST_NBR_INTERSECTION_BY_E_ENDPOINTS_BY_V_TEST
+                    prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu)
 
     ###############################################################################################
     # - MG PRIMS PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION tests -------------------------
     ConfigureTestMG(MG_PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION_TEST
-    prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu)
-    target_link_libraries(MG_PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION_TEST PRIVATE cuco::cuco)
+                    prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu)
 
     ###############################################################################################
     # - MG NBR SAMPLING tests ---------------------------------------------------------------------
     ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cu)
-    target_link_libraries(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG RANDOM_WALKS tests ---------------------------------------------------------------------
@@ -684,6 +695,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureCTestMG(MG_CAPI_SIMILARITY_TEST c_api/mg_similarity_test.c)
     ConfigureCTestMG(MG_CAPI_K_CORE_TEST c_api/mg_k_core_test.c)
     ConfigureCTestMG(MG_CAPI_INDUCED_SUBGRAPH_TEST c_api/mg_induced_subgraph_test.c)
+    ConfigureCTestMG(MG_CAPI_DEGREES c_api/mg_degrees_test.c)
     ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c)
     ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST c_api/mg_two_hop_neighbors_test.c)
 
@@ -752,6 +764,7 @@ ConfigureCTest(CAPI_CORE_NUMBER_TEST c_api/core_number_test.c)
 ConfigureCTest(CAPI_SIMILARITY_TEST c_api/similarity_test.c)
 ConfigureCTest(CAPI_K_CORE_TEST c_api/k_core_test.c)
 ConfigureCTest(CAPI_INDUCED_SUBGRAPH_TEST c_api/induced_subgraph_test.c)
+ConfigureCTest(CAPI_DEGREES c_api/degrees_test.c)
 ConfigureCTest(CAPI_EGONET_TEST c_api/egonet_test.c)
 ConfigureCTest(CAPI_TWO_HOP_NEIGHBORS_TEST c_api/two_hop_neighbors_test.c)
 ConfigureCTest(CAPI_LEGACY_K_TRUSS_TEST c_api/legacy_k_truss_test.c)
diff --git a/cpp/tests/c_api/c_test_utils.h b/cpp/tests/c_api/c_test_utils.h
index ab9fbeccd4b..fbbf6333ee3 100644
--- a/cpp/tests/c_api/c_test_utils.h
+++ b/cpp/tests/c_api/c_test_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,6 +101,8 @@ int create_sg_test_graph(const cugraph_resource_handle_t* handle,
                          cugraph_graph_t** graph,
                          cugraph_error_t** ret_error);
 
+size_t cugraph_size_t_allreduce(const cugraph_resource_handle_t* handle, size_t value);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/tests/c_api/degrees_test.c b/cpp/tests/c_api/degrees_test.c
new file mode 100644
index 00000000000..10a038b323b
--- /dev/null
+++ b/cpp/tests/c_api/degrees_test.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_test_utils.h" /* RUN_TEST */
+
+#include <cugraph_c/graph.h>
+#include <cugraph_c/graph_functions.h>
+
+#include <stdio.h>
+
+typedef int32_t vertex_t;
+typedef int32_t edge_t;
+typedef float weight_t;
+
+/*
+ * Build a single-GPU graph from the host COO arrays, compute vertex degrees
+ * through the C API, and compare them with the expected per-vertex values.
+ *
+ * h_src / h_dst / h_wgt hold num_edges edges; store_transposed and is_symmetric
+ * are forwarded to create_test_graph.  num_vertices is the result size expected
+ * for a full query, num_vertices_to_compute the size expected for a subset.
+ * h_vertices selects the vertices to query (NULL queries all of them).
+ * cugraph_degrees is called when both in_degrees and out_degrees are set,
+ * cugraph_in_degrees when only in_degrees is set, cugraph_out_degrees otherwise.
+ * h_in_degrees / h_out_degrees are indexed by vertex id; NULL skips that check.
+ * Returns test_ret_value (initialised to 0, updated by the TEST_ASSERT calls).
+ */
+int generic_degrees_test(vertex_t* h_src,
+                         vertex_t* h_dst,
+                         weight_t* h_wgt,
+                         size_t num_vertices,
+                         size_t num_edges,
+                         vertex_t* h_vertices,
+                         size_t num_vertices_to_compute,
+                         bool_t in_degrees,
+                         bool_t out_degrees,
+                         bool_t store_transposed,
+                         bool_t is_symmetric,
+                         edge_t *h_in_degrees,
+                         edge_t *h_out_degrees)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error    = NULL;
+
+  cugraph_resource_handle_t* handle                      = NULL;
+  cugraph_graph_t* graph                                 = NULL;
+  cugraph_degrees_result_t* result                       = NULL;
+  cugraph_type_erased_device_array_t* vertices           = NULL;
+  cugraph_type_erased_device_array_view_t* vertices_view = NULL;
+
+  handle = cugraph_create_resource_handle(NULL);
+  TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
+
+  ret_code = create_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, is_symmetric, &graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  /* Stage the requested subset on the device; a NULL view requests every vertex. */
+  if (h_vertices != NULL) {
+    ret_code = cugraph_type_erased_device_array_create(
+      handle, num_vertices_to_compute, INT32, &vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "vertices create failed.");
+
+    vertices_view = cugraph_type_erased_device_array_view(vertices);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, vertices_view, (byte_t*)h_vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "vertices copy_from_host failed.");
+  }
+
+  if (in_degrees && out_degrees) {
+    ret_code = cugraph_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  } else if (in_degrees) {
+    ret_code = cugraph_in_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  } else {
+    ret_code = cugraph_out_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  }
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "degrees computation failed.");
+  /* As after create_test_graph: report the library's error text here rather than
+   * passing a result that may never have been set to the accessors below. */
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  cugraph_type_erased_device_array_view_t* result_vertices;
+  cugraph_type_erased_device_array_view_t* result_in_degrees;
+  cugraph_type_erased_device_array_view_t* result_out_degrees;
+
+  result_vertices    = cugraph_degrees_result_get_vertices(result);
+  result_in_degrees  = cugraph_degrees_result_get_in_degrees(result);
+  result_out_degrees = cugraph_degrees_result_get_out_degrees(result);
+
+  size_t num_result_vertices = cugraph_type_erased_device_array_view_size(result_vertices);
+
+  vertex_t h_result_vertices[num_result_vertices];
+  edge_t   h_result_in_degrees[num_result_vertices];
+  edge_t   h_result_out_degrees[num_result_vertices];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_result_vertices, result_vertices, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  if (result_in_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_in_degrees, result_in_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (result_out_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_out_degrees, result_out_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (h_vertices != NULL) {
+    TEST_ASSERT(test_ret_value, num_result_vertices == num_vertices_to_compute, "results not the same size");
+  } else {
+    TEST_ASSERT(test_ret_value, num_result_vertices == num_vertices, "results not the same size");
+  }
+
+  for (size_t i = 0; (i < num_result_vertices) && (test_ret_value == 0); ++i) {
+    if (h_in_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_in_degrees[i] == h_in_degrees[h_result_vertices[i]], "in degree did not match");
+    }
+
+    if (h_out_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_out_degrees[i] == h_out_degrees[h_result_vertices[i]], "out degree did not match");
+    }
+  }
+
+  if (vertices != NULL) {  /* only allocated for subset queries */
+    cugraph_type_erased_device_array_view_free(vertices_view);
+    cugraph_type_erased_device_array_free(vertices);
+  }
+  cugraph_degrees_result_free(result);
+  cugraph_graph_free(graph);
+  cugraph_free_resource_handle(handle);
+  cugraph_error_free(ret_error);
+
+  return test_ret_value;
+}
+
+int test_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};  /* expected, indexed by vertex id */
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};  /* expected, indexed by vertex id */
+  /* Full query: in- and out-degrees of every vertex of the directed edge list. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              FALSE,  /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric()
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};  /* last 8 edges are the first 8 reversed */
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {2, 4, 3, 3, 2, 2};  /* expected, indexed by vertex id */
+  vertex_t h_out_degrees[] = {2, 4, 3, 3, 2, 2};  /* equal to the in-degrees: every edge has its reverse */
+  /* Full query on the symmetrized edge list. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};  /* expected, indexed by vertex id */
+  /* Full query through cugraph_in_degrees only. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              FALSE,  /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              h_in_degrees,
+                              NULL);  /* h_out_degrees: not checked */
+}
+
+int test_out_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};  /* expected, indexed by vertex id */
+  /* Full query through cugraph_out_degrees only. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              FALSE,  /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              NULL,   /* h_in_degrees: not checked */
+                              h_out_degrees);
+}
+
+int test_degrees_subset()
+{
+  size_t num_edges               = 8;
+  size_t num_vertices            = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query: in- and out-degrees of the vertices listed in h_vertices. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              FALSE,  /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric_subset()
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};  /* last 8 edges are the first 8 reversed */
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 3, 3, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 3, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query on the symmetrized edge list. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees_subset()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query through cugraph_in_degrees only. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              FALSE,  /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              h_in_degrees,
+                              NULL);  /* h_out_degrees: not checked */
+}
+
+int test_out_degrees_subset()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query through cugraph_out_degrees only. */
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              FALSE,  /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              NULL,   /* h_in_degrees: not checked */
+                              h_out_degrees);
+}
+
+/******************************************************************************/
+
+int main(int argc, char** argv)
+{
+  int result = 0;  /* OR-accumulates the value of every RUN_TEST below; used as the exit status */
+  result |= RUN_TEST(test_degrees);
+  result |= RUN_TEST(test_degrees_symmetric);
+  result |= RUN_TEST(test_in_degrees);
+  result |= RUN_TEST(test_out_degrees);
+  result |= RUN_TEST(test_degrees_subset);
+  result |= RUN_TEST(test_degrees_symmetric_subset);
+  result |= RUN_TEST(test_in_degrees_subset);
+  result |= RUN_TEST(test_out_degrees_subset);
+  return result;
+}
diff --git a/cpp/tests/c_api/mg_degrees_test.c b/cpp/tests/c_api/mg_degrees_test.c
new file mode 100644
index 00000000000..3312dd4f5bb
--- /dev/null
+++ b/cpp/tests/c_api/mg_degrees_test.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mg_test_utils.h" /* RUN_TEST */
+
+#include <cugraph_c/graph.h>
+#include <cugraph_c/graph_functions.h>
+
+#include <stdio.h>
+
+typedef int32_t vertex_t;
+typedef int32_t edge_t;
+typedef float weight_t;
+
+/*
+ * Multi-GPU variant: build an MG graph from the host COO arrays, compute vertex
+ * degrees through the C API, and compare this rank's results with the expected
+ * per-vertex values.
+ *
+ * h_src / h_dst / h_wgt hold num_edges edges; store_transposed and is_symmetric
+ * are forwarded to create_mg_test_graph.  num_vertices is the global result size
+ * expected for a full query, num_vertices_to_compute the one for a subset.
+ * h_vertices selects the vertices to query (NULL queries all of them); only
+ * rank 0 passes them to the library.  cugraph_degrees is called when both
+ * in_degrees and out_degrees are set, cugraph_in_degrees when only in_degrees
+ * is set, cugraph_out_degrees otherwise.  h_in_degrees / h_out_degrees are
+ * indexed by vertex id; NULL skips that check.
+ * Returns test_ret_value (initialised to 0, updated by the TEST_ASSERT calls).
+ */
+int generic_degrees_test(const cugraph_resource_handle_t* handle,
+                         vertex_t* h_src,
+                         vertex_t* h_dst,
+                         weight_t* h_wgt,
+                         size_t num_vertices,
+                         size_t num_edges,
+                         vertex_t* h_vertices,
+                         size_t num_vertices_to_compute,
+                         bool_t in_degrees,
+                         bool_t out_degrees,
+                         bool_t store_transposed,
+                         bool_t is_symmetric,
+                         edge_t* h_in_degrees,
+                         edge_t* h_out_degrees)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error    = NULL;
+
+  cugraph_graph_t* graph                                 = NULL;
+  cugraph_degrees_result_t* result                       = NULL;
+  cugraph_type_erased_device_array_t* vertices           = NULL;
+  cugraph_type_erased_device_array_view_t* vertices_view = NULL;
+
+  ret_code = create_mg_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, is_symmetric, &graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_mg_test_graph failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  if (h_vertices != NULL) {
+    /* Only rank 0 supplies the query vertices; every other rank passes an empty array. */
+    int rank               = cugraph_resource_handle_get_rank(handle);
+    size_t num_to_allocate = (rank == 0) ? num_vertices_to_compute : 0;
+
+    ret_code = cugraph_type_erased_device_array_create(
+      handle, num_to_allocate, INT32, &vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "vertices create failed.");
+
+    vertices_view = cugraph_type_erased_device_array_view(vertices);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, vertices_view, (byte_t*)h_vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "vertices copy_from_host failed.");
+  }
+
+  /* One dispatch serves both queries: vertices_view is NULL for the full query. */
+  if (in_degrees && out_degrees) {
+    ret_code = cugraph_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  } else if (in_degrees) {
+    ret_code = cugraph_in_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  } else {
+    ret_code = cugraph_out_degrees(handle, graph, vertices_view, FALSE, &result, &ret_error);
+  }
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "degrees computation failed.");
+
+  cugraph_type_erased_device_array_view_t* result_vertices;
+  cugraph_type_erased_device_array_view_t* result_in_degrees;
+  cugraph_type_erased_device_array_view_t* result_out_degrees;
+
+  result_vertices    = cugraph_degrees_result_get_vertices(result);
+  result_in_degrees  = cugraph_degrees_result_get_in_degrees(result);
+  result_out_degrees = cugraph_degrees_result_get_out_degrees(result);
+
+  size_t num_result_vertices = cugraph_type_erased_device_array_view_size(result_vertices);
+
+  /* A rank may receive no result vertices, and a VLA needs at least one element. */
+  size_t host_capacity = (num_result_vertices > 0) ? num_result_vertices : 1;
+
+  vertex_t h_result_vertices[host_capacity];
+  edge_t   h_result_in_degrees[host_capacity];
+  edge_t   h_result_out_degrees[host_capacity];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_result_vertices, result_vertices, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  if (result_in_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_in_degrees, result_in_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (result_out_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_out_degrees, result_out_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  /* Each rank holds only part of the result: sum the per-rank sizes with a single
+   * collective call (every rank reaches it exactly once) and compare the total. */
+  size_t num_global_result_vertices = cugraph_size_t_allreduce(handle, num_result_vertices);
+  size_t num_expected_vertices      = (h_vertices != NULL) ? num_vertices_to_compute : num_vertices;
+  TEST_ASSERT(test_ret_value, num_global_result_vertices == num_expected_vertices, "results not the same size");
+
+  for (size_t i = 0; (i < num_result_vertices) && (test_ret_value == 0); ++i) {
+    if (h_in_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_in_degrees[i] == h_in_degrees[h_result_vertices[i]], "in degree did not match");
+    }
+
+    if (h_out_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_out_degrees[i] == h_out_degrees[h_result_vertices[i]], "out degree did not match");
+    }
+  }
+
+  /* Release everything this function allocated; the handle belongs to the caller. */
+  if (vertices != NULL) {
+    cugraph_type_erased_device_array_view_free(vertices_view);
+    cugraph_type_erased_device_array_free(vertices);
+  }
+  cugraph_degrees_result_free(result);
+  cugraph_graph_free(graph);
+  cugraph_error_free(ret_error);
+
+  return test_ret_value;
+}
+
+int test_degrees(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges            = 8;
+  size_t num_vertices         = 6;
+
+  vertex_t h_src[]               = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]               = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]               = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]        = {1, 2, 0, 2, 1, 2};  /* expected, indexed by vertex id */
+  vertex_t h_out_degrees[]       = {1, 2, 3, 1, 1, 0};  /* expected, indexed by vertex id */
+
+  // Full query of in- and out-degrees on a graph created with store_transposed = TRUE
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              TRUE,   /* store_transposed */
+                              FALSE,  /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};  /* last 8 edges are the first 8 reversed */
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {2, 4, 3, 3, 2, 2};  /* expected, indexed by vertex id */
+  vertex_t h_out_degrees[] = {2, 4, 3, 3, 2, 2};  /* equal to the in-degrees: every edge has its reverse */
+
+  // Full query on the symmetrized edge list, graph created with store_transposed = TRUE
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              TRUE,   /* store_transposed */
+                              TRUE,   /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees(const cugraph_resource_handle_t *handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};  /* expected, indexed by vertex id */
+  /* Full query through cugraph_in_degrees only. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              TRUE,   /* in_degrees */
+                              FALSE,  /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              h_in_degrees,
+                              NULL);  /* h_out_degrees: not checked */
+}
+
+int test_out_degrees(const cugraph_resource_handle_t *handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};  /* expected, indexed by vertex id */
+  /* Full query through cugraph_out_degrees only. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,   /* h_vertices: query every vertex */
+                              0,      /* num_vertices_to_compute */
+                              FALSE,  /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              NULL,   /* h_in_degrees: not checked */
+                              h_out_degrees);
+}
+
+int test_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges               = 8;
+  size_t num_vertices            = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query: in- and out-degrees of the vertices listed in h_vertices. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              FALSE,  /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};  /* last 8 edges are the first 8 reversed */
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 3, 3, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 3, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query on the symmetrized edge list. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric */
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query through cugraph_in_degrees only. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,   /* in_degrees */
+                              FALSE,  /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              h_in_degrees,
+                              NULL);  /* h_out_degrees: not checked */
+}
+
+int test_out_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};  /* vertices whose degrees are requested */
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};  /* indexed by vertex id; -1 marks vertices not in h_vertices */
+  /* Subset query through cugraph_out_degrees only. */
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              FALSE,  /* in_degrees */
+                              TRUE,   /* out_degrees */
+                              FALSE,  /* store_transposed */
+                              TRUE,   /* is_symmetric -- NOTE(review): edge list is directed (0->1 has no 1->0); confirm TRUE is intended */
+                              NULL,   /* h_in_degrees: not checked */
+                              h_out_degrees);
+}
+
+/******************************************************************************/
+
+int main(int argc, char** argv)
+{
+  void* raft_handle                 = create_mg_raft_handle(argc, argv);  /* freed by free_mg_raft_handle below */
+  cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle);  /* one handle shared by every test */
+
+  int result = 0;  /* OR-accumulates the value of every RUN_MG_TEST below; used as the exit status */
+  result |= RUN_MG_TEST(test_degrees, handle);
+  result |= RUN_MG_TEST(test_degrees_symmetric, handle);
+  result |= RUN_MG_TEST(test_in_degrees, handle);
+  result |= RUN_MG_TEST(test_out_degrees, handle);
+  result |= RUN_MG_TEST(test_degrees_subset, handle);
+  result |= RUN_MG_TEST(test_degrees_symmetric_subset, handle);
+  result |= RUN_MG_TEST(test_in_degrees_subset, handle);
+  result |= RUN_MG_TEST(test_out_degrees_subset, handle);
+
+  cugraph_free_resource_handle(handle);  /* release in reverse order of creation */
+  free_mg_raft_handle(raft_handle);
+
+  return result;
+}
diff --git a/cpp/tests/c_api/test_utils.cpp b/cpp/tests/c_api/test_utils.cpp
index e37cc4555dd..3013cbb7cc6 100644
--- a/cpp/tests/c_api/test_utils.cpp
+++ b/cpp/tests/c_api/test_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,9 @@
  */
 
 #include "c_test_utils.h"
+#include "c_api/resource_handle.hpp"
+
+#include <cugraph/utilities/host_scalar_comm.hpp>
 
 #include <math.h>
 
@@ -388,3 +391,12 @@ int create_sg_test_graph(const cugraph_resource_handle_t* handle,
 
   return test_ret_value;
 }
+
+extern "C" size_t cugraph_size_t_allreduce(const cugraph_resource_handle_t* handle, size_t value)
+{  // Test helper with C linkage: all-reduces `value` with op SUM over the comms held by `handle`.
+  auto internal_handle = reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const *>(handle);  // opaque C handle -> internal C++ wrapper
+  return cugraph::host_scalar_allreduce(internal_handle->handle_->get_comms(),
+                                        value,
+                                        raft::comms::op_t::SUM,
+                                        internal_handle->handle_->get_stream());
+}
diff --git a/cpp/tests/community/mg_mis_test.cu b/cpp/tests/components/mg_mis_test.cu
similarity index 95%
rename from cpp/tests/community/mg_mis_test.cu
rename to cpp/tests/components/mg_mis_test.cu
index 1240cf812f9..04c346b0f00 100644
--- a/cpp/tests/community/mg_mis_test.cu
+++ b/cpp/tests/components/mg_mis_test.cu
@@ -245,18 +245,20 @@ TEST_P(Tests_MGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+bool constexpr check_correctness = false;
 INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGMaximalIndependentSet_File,
-  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false},
-                                       MaximalIndependentSet_Usecase{false}),
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                                       MaximalIndependentSet_Usecase{check_correctness}),
                      ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
 
-INSTANTIATE_TEST_SUITE_P(rmat_small_test,
-                         Tests_MGMaximalIndependentSet_Rmat,
-                         ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false}),
-                                            ::testing::Values(cugraph::test::Rmat_Usecase(
-                                              3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
@@ -266,7 +268,8 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGMaximalIndependentSet_Rmat,
   ::testing::Combine(
-    ::testing::Values(MaximalIndependentSet_Usecase{false}, MaximalIndependentSet_Usecase{false}),
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                      MaximalIndependentSet_Usecase{check_correctness}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mg_vertex_coloring_test.cu b/cpp/tests/components/mg_vertex_coloring_test.cu
new file mode 100644
index 00000000000..ce4dadaa786
--- /dev/null
+++ b/cpp/tests/components/mg_vertex_coloring_test.cu
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prims/fill_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/property_generator.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_reduce_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct GraphColoring_UseCase {
+  bool check_correctness{true};
+};
+
+// Multi-GPU vertex coloring tests: build the graph, run cugraph::vertex_coloring and, when
+// check_correctness is set, verify that no edge joins two distinct vertices of one color.
+template <typename input_usecase_t>
+class Tests_MGGraphColoring
+  : public ::testing::TestWithParam<std::tuple<GraphColoring_UseCase, input_usecase_t>> {
+ public:
+  Tests_MGGraphColoring() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<GraphColoring_UseCase, input_usecase_t> const& param)
+  {
+    auto [coloring_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    constexpr bool multi_gpu = true;
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    raft::random::RngState rng_state(multi_gpu ? handle_->get_comms().get_rank() : 0);
+    auto d_colors =
+      cugraph::vertex_coloring<vertex_t, edge_t, multi_gpu>(*handle_, mg_graph_view, rng_state);
+
+    // Test Graph Coloring
+
+    if (coloring_usecase.check_correctness) {
+      std::vector<vertex_t> h_colors(d_colors.size());
+      raft::update_host(h_colors.data(), d_colors.data(), d_colors.size(), handle_->get_stream());
+      // update_host is stream-ordered; wait for the copy before reading h_colors on the host
+      handle_->sync_stream();
+
+      std::for_each(h_colors.begin(),
+                    h_colors.end(),
+                    [num_vertices = mg_graph_view.number_of_vertices()](vertex_t color_id) {
+                      ASSERT_TRUE(color_id <= num_vertices);
+                    });
+
+      using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+      cugraph::edge_src_property_t<GraphViewType, vertex_t> src_color_cache(*handle_);
+      cugraph::edge_dst_property_t<GraphViewType, vertex_t> dst_color_cache(*handle_);
+
+      if constexpr (multi_gpu) {
+        src_color_cache =
+          cugraph::edge_src_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        dst_color_cache =
+          cugraph::edge_dst_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        update_edge_src_property(*handle_, mg_graph_view, d_colors.begin(), src_color_cache);
+        update_edge_dst_property(*handle_, mg_graph_view, d_colors.begin(), dst_color_cache);
+      }
+
+      rmm::device_uvector<uint8_t> d_color_conflicts(
+        mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+      per_v_transform_reduce_outgoing_e(
+        *handle_,
+        mg_graph_view,
+        multi_gpu
+          ? src_color_cache.view()
+          : cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.data()),
+        multi_gpu ? dst_color_cache.view()
+                  : cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.data(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return uint8_t{1};
+          } else {
+            return uint8_t{0};
+          }
+        },
+        uint8_t{0},
+        cugraph::reduce_op::maximum<uint8_t>{},
+        d_color_conflicts.begin());
+
+      std::vector<uint8_t> h_color_conflicts(d_color_conflicts.size());
+      raft::update_host(h_color_conflicts.data(),
+                        d_color_conflicts.data(),
+                        d_color_conflicts.size(),
+                        handle_->get_stream());
+
+      std::vector<vertex_t> h_vertices_in_this_proces((*mg_renumber_map).size());
+
+      raft::update_host(h_vertices_in_this_proces.data(),
+                        (*mg_renumber_map).data(),
+                        (*mg_renumber_map).size(),
+                        handle_->get_stream());
+      handle_->sync_stream();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      edge_t nr_conflicts = cugraph::transform_reduce_e(
+        *handle_,
+        mg_graph_view,
+        multi_gpu ? src_color_cache.view()
+                  : cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.begin()),
+        multi_gpu ? dst_color_cache.view()
+                  : cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.begin(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return edge_t{1};
+          } else {
+            return edge_t{0};
+          }
+        },
+        edge_t{0});
+
+      ASSERT_TRUE(nr_conflicts == edge_t{0})
+        << "adjacent vertices can't have same color." << std::endl;
+
+      {
+        thrust::for_each(
+          thrust::host,
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.begin(), h_vertices_in_this_proces.begin(), h_color_conflicts.begin())),
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.end(), h_vertices_in_this_proces.end(), h_color_conflicts.end())),
+          [](auto color_vetex_and_conflict_flag) {
+            auto v             = thrust::get<1>(color_vetex_and_conflict_flag);
+            auto conflict_flag = thrust::get<2>(color_vetex_and_conflict_flag);
+            ASSERT_TRUE(conflict_flag == 0)
+              << v << " got same color as one of its neighbor" << std::endl;
+          });
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGGraphColoring<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGGraphColoring_File = Tests_MGGraphColoring<cugraph::test::File_Usecase>;
+using Tests_MGGraphColoring_Rmat = Tests_MGGraphColoring<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;  // flag handed to every test suite instantiated below
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGGraphColoring_File,
+  ::testing::Combine(::testing::Values(GraphColoring_UseCase{check_correctness},
+                                       GraphColoring_UseCase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness},
+                      GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mis_test.cu b/cpp/tests/components/mis_test.cu
new file mode 100644
index 00000000000..f3bdd3d0e8b
--- /dev/null
+++ b/cpp/tests/components/mis_test.cu
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct MaximalIndependentSet_Usecase {
+  bool check_correctness{true};
+};
+
+// Single-GPU maximal independent set tests: build the graph, run
+// cugraph::maximal_independent_set and, when check_correctness is set, verify independence.
+template <typename input_usecase_t>
+class Tests_SGMaximalIndependentSet
+  : public ::testing::TestWithParam<std::tuple<MaximalIndependentSet_Usecase, input_usecase_t>> {
+ public:
+  Tests_SGMaximalIndependentSet() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
+  {
+    auto [mis_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
+    }
+
+    constexpr bool multi_gpu = false;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto sg_graph_view = sg_graph.view();
+
+    raft::random::RngState rng_state(0);
+    auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      handle, sg_graph_view, rng_state);
+
+    // Test MIS
+    if (mis_usecase.check_correctness) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      std::vector<vertex_t> h_mis(d_mis.size());
+      raft::update_host(h_mis.data(), d_mis.data(), d_mis.size(), handle.get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
+      auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+
+      std::for_each(h_mis.begin(), h_mis.end(), [vertex_first, vertex_last](vertex_t v) {
+        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
+      });
+      // ASSERT_* inside the lambda only leaves the lambda; stop here so that an out-of-range
+      // vertex is never used as an index into the flag array below.
+      if (::testing::Test::HasFailure()) { return; }
+
+      // If a vertex is included in MIS, then none of its neighbor should be
+
+      vertex_t local_vtx_partition_size = sg_graph_view.local_vertex_partition_range_size();
+      rmm::device_uvector<vertex_t> d_total_outgoing_nbrs_included_mis(local_vtx_partition_size,
+                                                                       handle.get_stream());
+
+      rmm::device_uvector<vertex_t> inclusion_flags(local_vtx_partition_size, handle.get_stream());
+
+      thrust::uninitialized_fill(
+        handle.get_thrust_policy(), inclusion_flags.begin(), inclusion_flags.end(), vertex_t{0});
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        d_mis.begin(),
+        d_mis.end(),
+        [inclusion_flags =
+           raft::device_span<vertex_t>(inclusion_flags.data(), inclusion_flags.size()),
+         v_first = sg_graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+          auto v_offset             = v - v_first;
+          inclusion_flags[v_offset] = vertex_t{1};
+        });
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      per_v_transform_reduce_outgoing_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(
+          inclusion_flags.data()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+          inclusion_flags.data(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          return (src == dst) ? 0 : dst_included;
+        },
+        vertex_t{0},
+        cugraph::reduce_op::plus<vertex_t>{},
+        d_total_outgoing_nbrs_included_mis.begin());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      std::vector<vertex_t> h_total_outgoing_nbrs_included_mis(
+        d_total_outgoing_nbrs_included_mis.size());
+      raft::update_host(h_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.size(),
+                        handle.get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      {
+        std::for_each(h_mis.begin(),
+                      h_mis.end(),
+                      [vertex_first, vertex_last, &h_total_outgoing_nbrs_included_mis](vertex_t v) {
+                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last))
+                          << v << " is not within vertex partition range" << std::endl;
+
+                        ASSERT_TRUE(h_total_outgoing_nbrs_included_mis[v - vertex_first] == 0)
+                          << v << "'s neighbor is included in MIS" << std::endl;
+                      });
+      }
+    }
+  }
+};
+
+using Tests_SGMaximalIndependentSet_File =
+  Tests_SGMaximalIndependentSet<cugraph::test::File_Usecase>;
+using Tests_SGMaximalIndependentSet_Rmat =
+  Tests_SGMaximalIndependentSet<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;  // flag handed to every test suite instantiated below
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGMaximalIndependentSet_File,
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                                       MaximalIndependentSet_Usecase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                      MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/vertex_coloring_test.cu b/cpp/tests/components/vertex_coloring_test.cu
new file mode 100644
index 00000000000..27a0c5013bd
--- /dev/null
+++ b/cpp/tests/components/vertex_coloring_test.cu
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_reduce_e.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct GraphColoring_UseCase {
+  bool check_correctness{true};
+};
+
+// Single-GPU vertex coloring tests: build the graph, run cugraph::vertex_coloring and, when
+// check_correctness is set, verify that no edge joins two distinct vertices of one color.
+template <typename input_usecase_t>
+class Tests_SGGraphColoring
+  : public ::testing::TestWithParam<std::tuple<GraphColoring_UseCase, input_usecase_t>> {
+ public:
+  Tests_SGGraphColoring() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<GraphColoring_UseCase, input_usecase_t> const& param)
+  {
+    auto [coloring_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
+    }
+
+    constexpr bool multi_gpu = false;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto sg_graph_view = sg_graph.view();
+
+    raft::random::RngState rng_state(0);
+    auto d_colors =
+      cugraph::vertex_coloring<vertex_t, edge_t, multi_gpu>(handle, sg_graph_view, rng_state);
+
+    // Test Graph Coloring
+
+    if (coloring_usecase.check_correctness) {
+      std::vector<vertex_t> h_colors(d_colors.size());
+      raft::update_host(h_colors.data(), d_colors.data(), d_colors.size(), handle.get_stream());
+      // update_host is stream-ordered; wait for the copy before reading h_colors on the host
+      handle.sync_stream();
+
+      std::for_each(h_colors.begin(),
+                    h_colors.end(),
+                    [num_vertices = sg_graph_view.number_of_vertices()](vertex_t color_id) {
+                      ASSERT_TRUE(color_id <= num_vertices);
+                    });
+
+      rmm::device_uvector<uint8_t> d_color_conflict_flags(
+        sg_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+      per_v_transform_reduce_outgoing_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.data()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(d_colors.data(),
+                                                                               vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return uint8_t{1};
+          } else {
+            return uint8_t{0};
+          }
+        },
+        uint8_t{0},
+        cugraph::reduce_op::maximum<uint8_t>{},
+        d_color_conflict_flags.begin());
+
+      std::vector<uint8_t> h_color_conflict_flags(d_color_conflict_flags.size());
+      raft::update_host(h_color_conflict_flags.data(),
+                        d_color_conflict_flags.data(),
+                        d_color_conflict_flags.size(),
+                        handle.get_stream());
+
+      std::vector<vertex_t> h_vertices_in_this_proces((*sg_renumber_map).size());
+
+      raft::update_host(h_vertices_in_this_proces.data(),
+                        (*sg_renumber_map).data(),
+                        (*sg_renumber_map).size(),
+                        handle.get_stream());
+      handle.sync_stream();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      edge_t nr_conflicts = cugraph::transform_reduce_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.begin()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(d_colors.begin(),
+                                                                               vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return edge_t{1};
+          } else {
+            return edge_t{0};
+          }
+        },
+        edge_t{0});
+
+      ASSERT_TRUE(nr_conflicts == edge_t{0})
+        << "adjacent vertices can't have same color." << std::endl;
+
+      {
+        thrust::for_each(
+          thrust::host,
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.begin(), h_vertices_in_this_proces.begin(), h_color_conflict_flags.begin())),
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.end(), h_vertices_in_this_proces.end(), h_color_conflict_flags.end())),
+          [](auto color_vetex_and_conflict_flag) {
+            auto v             = thrust::get<1>(color_vetex_and_conflict_flag);
+            auto conflict_flag = thrust::get<2>(color_vetex_and_conflict_flag);
+            ASSERT_TRUE(conflict_flag == 0)
+              << v << " got same color as one of its neighbor" << std::endl;
+          });
+      }
+    }
+  }
+};
+
+using Tests_SGGraphColoring_File = Tests_SGGraphColoring<cugraph::test::File_Usecase>;
+using Tests_SGGraphColoring_Rmat = Tests_SGGraphColoring<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;  // flag handed to every test suite instantiated below
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGGraphColoring_File,
+  ::testing::Combine(::testing::Values(GraphColoring_UseCase{check_correctness},
+                                       GraphColoring_UseCase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness},
+                      GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
new file mode 100644
index 00000000000..af56807746a
--- /dev/null
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "property_generator.cuh"
+#include "result_compare.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/equal.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cuco/hash_functions.cuh>
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <sstream>
+
+template <typename vertex_t, typename edge_value_t, typename result_t>
+struct key_aggregated_e_op_t {
+  // Edge operator handed to the key-aggregated primitive in this test: yields the smaller of
+  // src_property and key_property. src, key and edge_property are accepted but never read.
+  __device__ result_t operator()(vertex_t src,
+                                 vertex_t key,
+                                 result_t src_property,
+                                 result_t key_property,
+                                 edge_value_t edge_property) const
+  {
+    // Strict '<': key_property is returned on ties and whenever the comparison is false.
+    bool const src_is_smaller = src_property < key_property;
+    return src_is_smaller ? src_property : key_property;
+  }
+};
+
+struct Prims_Usecase {
+  bool test_weighted{false};     // passed to construct_graph(); presumably requests edge weights
+  bool edge_masking{false};      // when true, a generated edge mask is attached to the MG view
+  bool check_correctness{true};  // when true, MG results are compared against an SG run
+};
+
+template <typename input_usecase_t>
+class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the MG and SG results of per_v_transform_reduce_dst_key_aggregated_outgoing_e
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, prims_usecase.test_weighted, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG per_v_transform_reduce_dst_key_aggregated_outgoing_e
+
+    const int vertex_prop_hash_bin_count = 5;
+    const int key_hash_bin_count         = 10;
+    const int key_prop_hash_bin_count    = 20;
+    const int initial_value              = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *mg_renumber_map, vertex_prop_hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+      *handle_, *mg_renumber_map, key_hash_bin_count);
+    auto mg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+
+    rmm::device_uvector<vertex_t> mg_kv_store_keys(comm_rank == 0 ? key_hash_bin_count : int{0},
+                                                   handle_->get_stream());
+    thrust::sequence(
+      handle_->get_thrust_policy(), mg_kv_store_keys.begin(), mg_kv_store_keys.end(), vertex_t{0});
+    mg_kv_store_keys = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+      *handle_, std::move(mg_kv_store_keys));
+    auto mg_kv_store_values = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, mg_kv_store_keys, key_prop_hash_bin_count);
+
+    static_assert(std::is_same_v<result_t, int> ||
+                  std::is_same_v<result_t, thrust::tuple<int, float>>);
+    result_t invalid_value{};
+    if constexpr (std::is_same_v<result_t, int>) {
+      invalid_value = std::numeric_limits<int>::max();
+    } else {
+      invalid_value =
+        thrust::make_tuple(std::numeric_limits<int>::max(), std::numeric_limits<float>::max());
+    }
+    cugraph::kv_store_t<vertex_t, result_t, false> mg_kv_store(
+      mg_kv_store_keys.begin(),
+      mg_kv_store_keys.end(),
+      cugraph::get_dataframe_buffer_begin(mg_kv_store_values),
+      cugraph::invalid_vertex_id<vertex_t>::value,
+      invalid_value,
+      handle_->get_stream());
+
+    enum class reduction_type_t { PLUS, ELEMWISE_MIN, ELEMWISE_MAX };
+    std::array<reduction_type_t, 3> reduction_types = {
+      reduction_type_t::PLUS, reduction_type_t::ELEMWISE_MIN, reduction_type_t::ELEMWISE_MAX};
+
+    std::vector<decltype(cugraph::allocate_dataframe_buffer<result_t>(0, rmm::cuda_stream_view{}))>
+      mg_results{};
+    mg_results.reserve(reduction_types.size());
+
+    for (size_t i = 0; i < reduction_types.size(); ++i) {
+      mg_results.push_back(cugraph::allocate_dataframe_buffer<result_t>(
+        mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()));
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.start("MG per_v_transform_reduce_dst_key_aggregated_outgoing_e");
+      }
+
+      switch (reduction_types[i]) {
+        case reduction_type_t::PLUS:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::plus<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::plus<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        case reduction_type_t::ELEMWISE_MIN:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_minimum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_minimum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        case reduction_type_t::ELEMWISE_MAX:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_maximum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_maximum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        default: FAIL() << "should not be reached.";
+      }
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.stop();
+        hr_timer.display_and_clear(std::cout);
+      }
+    }
+
+    // 3. compare SG & MG results
+
+    if (prims_usecase.check_correctness) {
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::optional<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+        sg_edge_weights{std::nullopt};
+      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,  // forward weights so the weighted SG branches below are reachable
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      for (size_t i = 0; i < reduction_types.size(); ++i) {
+        auto mg_aggregate_results =
+          cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+
+        static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<result_t>::value);
+        if constexpr (std::is_arithmetic_v<result_t>) {
+          std::tie(std::ignore, mg_aggregate_results) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<result_t const>(mg_results[i].data(), mg_results[i].size()));
+        } else {
+          std::tie(std::ignore, std::get<0>(mg_aggregate_results)) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<typename thrust::tuple_element<0, result_t>::type const>(
+                std::get<0>(mg_results[i]).data(), std::get<0>(mg_results[i]).size()));
+
+          std::tie(std::ignore, std::get<1>(mg_aggregate_results)) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<typename thrust::tuple_element<1, result_t>::type const>(
+                std::get<1>(mg_results[i]).data(), std::get<1>(mg_results[i]).size()));
+        }
+
+        if (handle_->get_comms().get_rank() == int{0}) {
+          auto sg_graph_view = sg_graph.view();
+          auto sg_edge_weight_view =
+            sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+          auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+            *handle_,
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+            vertex_prop_hash_bin_count);
+          auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+            *handle_, sg_graph_view, sg_vertex_prop);
+
+          auto sg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+            *handle_,
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+            key_hash_bin_count);
+          auto sg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+            *handle_, sg_graph_view, sg_vertex_key);
+
+          rmm::device_uvector<vertex_t> sg_kv_store_keys(key_hash_bin_count, handle_->get_stream());
+          thrust::sequence(handle_->get_thrust_policy(),
+                           sg_kv_store_keys.begin(),
+                           sg_kv_store_keys.end(),
+                           vertex_t{0});
+          auto sg_kv_store_values = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+            *handle_, sg_kv_store_keys, key_prop_hash_bin_count);
+
+          cugraph::kv_store_t<vertex_t, result_t, false> sg_kv_store(
+            sg_kv_store_keys.begin(),
+            sg_kv_store_keys.end(),
+            cugraph::get_dataframe_buffer_begin(sg_kv_store_values),
+            cugraph::invalid_vertex_id<vertex_t>::value,
+            invalid_value,
+            handle_->get_stream());
+
+          cugraph::test::vector_result_compare compare{*handle_};
+
+          auto global_result = cugraph::allocate_dataframe_buffer<result_t>(
+            sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+          switch (reduction_types[i]) {
+            case reduction_type_t::PLUS:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::plus<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::plus<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            case reduction_type_t::ELEMWISE_MIN:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_minimum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_minimum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            case reduction_type_t::ELEMWISE_MAX:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_maximum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_maximum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            default: FAIL() << "should not be reached.";
+          }
+
+          ASSERT_TRUE(compare(mg_aggregate_results, global_result));
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File =
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<cugraph::test::File_Usecase>;
+using Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat =
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<cugraph::test::Rmat_Usecase>;
+
+// FIXME: these tests do not build because cugraph::kv_store_t has a build error when
+// use_binary_search = false and value_t is thrust::tuple; this will be fixed in a separate PR.
+#if 0
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+       CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto [prims_usecase, input_usecase] = GetParam();  // (Prims_Usecase, File_Usecase)
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(
+    prims_usecase, input_usecase);
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int64FloatTupleIntFloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt64Int64FloatTupleIntFloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+#endif
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+       CheckInt32Int32FloatTransposeFalse)
+{
+  auto [prims_usecase, input_usecase] = GetParam();  // (Prims_Usecase, File_Usecase)
+  run_current_test<int32_t, int32_t, float, int>(prims_usecase, input_usecase);
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int32FloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int64FloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int64_t, float, int>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt64Int64FloatTransposeFalse)
+{
+  // R-mat parameters may be overridden from the command line before the test runs.
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int64_t, int64_t, float, int>(
+    prims_usecase, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, true},  // unweighted, no mask, verified
+                      Prims_Usecase{false, true, true},   // unweighted, edge mask, verified
+                      Prims_Usecase{true, false, true},   // weighted, no mask, verified
+                      Prims_Usecase{true, true, true}),   // weighted, edge mask, verified
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,  // {test_weighted, edge_masking, check_correctness}
+                         Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, false},  // unweighted, no mask, not verified
+                      Prims_Usecase{false, true, false},   // unweighted, edge mask, not verified
+                      Prims_Usecase{true, false, false},   // weighted, no mask, not verified
+                      Prims_Usecase{true, true, false}),   // weighted, edge mask, not verified
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index efab53f89e6..a459a677569 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -18,6 +18,7 @@
 #include "prims/reduce_op.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -72,83 +73,6 @@ struct e_op_t {
   }
 };
 
-template <typename T>
-__host__ __device__ bool compare_scalar(T val0, T val1, thrust::optional<T> threshold_ratio)
-{
-  if (threshold_ratio) {
-    return std::abs(val0 - val1) <= (std::max(std::abs(val0), std::abs(val1)) * *threshold_ratio);
-  } else {
-    return val0 == val1;
-  }
-}
-
-template <typename T>
-struct comparator {
-  static constexpr double threshold_ratio{1e-2};
-
-  __host__ __device__ bool operator()(T t0, T t1) const
-  {
-    static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
-    if constexpr (std::is_arithmetic_v<T>) {
-      return compare_scalar(
-        t0,
-        t1,
-        std::is_floating_point_v<T> ? thrust::optional<T>{threshold_ratio} : thrust::nullopt);
-    } else {
-      auto val0   = thrust::get<0>(t0);
-      auto val1   = thrust::get<0>(t1);
-      auto passed = compare_scalar(val0,
-                                   val1,
-                                   std::is_floating_point_v<decltype(val0)>
-                                     ? thrust::optional<decltype(val0)>{threshold_ratio}
-                                     : thrust::nullopt);
-      if (!passed) return false;
-
-      if constexpr (thrust::tuple_size<T>::value >= 2) {
-        auto val0   = thrust::get<1>(t0);
-        auto val1   = thrust::get<1>(t1);
-        auto passed = compare_scalar(val0,
-                                     val1,
-                                     std::is_floating_point_v<decltype(val1)>
-                                       ? thrust::optional<decltype(val1)>{threshold_ratio}
-                                       : thrust::nullopt);
-        if (!passed) return false;
-      }
-      if constexpr (thrust::tuple_size<T>::value >= 3) {
-        assert(false);  // should not be reached.
-      }
-      return true;
-    }
-  }
-};
-
-struct result_compare {
-  const raft::handle_t& handle_;
-  result_compare(raft::handle_t const& handle) : handle_(handle) {}
-
-  template <typename... Args>
-  auto operator()(const std::tuple<rmm::device_uvector<Args>...>& t1,
-                  const std::tuple<rmm::device_uvector<Args>...>& t2)
-  {
-    using type = thrust::tuple<Args...>;
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
-  }
-
-  template <typename T>
-  auto operator()(const rmm::device_uvector<T>& t1, const rmm::device_uvector<T>& t2)
-  {
-    return thrust::equal(
-      handle_.get_thrust_policy(), t1.begin(), t1.end(), t2.begin(), comparator<T>());
-  }
-
- private:
-  template <typename T, std::size_t... I>
-  auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (result_compare::operator()(std::get<I>(t1), std::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool test_weighted{false};
   bool edge_masking{false};
@@ -440,7 +364,7 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
             *handle_, sg_graph_view, sg_vertex_prop);
           auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
             *handle_, sg_graph_view, sg_vertex_prop);
-          result_compare comp{*handle_};
+          cugraph::test::vector_result_compare compare{*handle_};
 
           auto global_in_result = cugraph::allocate_dataframe_buffer<result_t>(
             sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -528,8 +452,8 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
             default: FAIL() << "should not be reached.";
           }
 
-          ASSERT_TRUE(comp(mg_aggregate_in_results, global_in_result));
-          ASSERT_TRUE(comp(mg_aggregate_out_results, global_out_result));
+          ASSERT_TRUE(compare(mg_aggregate_in_results, global_in_result));
+          ASSERT_TRUE(compare(mg_aggregate_out_results, global_out_result));
         }
       }
     }
diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu
index da3354b77d9..783e17b6d8f 100644
--- a/cpp/tests/prims/mg_reduce_v.cu
+++ b/cpp/tests/prims/mg_reduce_v.cu
@@ -17,6 +17,7 @@
 #include "prims/property_op_utils.cuh"
 #include "prims/reduce_v.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -49,50 +50,6 @@
 
 #include <random>
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-2};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using Type = thrust::tuple<Args...>;
-  constexpr auto operator()(const Type& t1, const Type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<Type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -249,7 +206,7 @@ class Tests_MGReduceV
               break;
             default: FAIL() << "should not be reached.";
           }
-          result_compare<result_t> compare{};
+          cugraph::test::scalar_result_compare compare{};
           ASSERT_TRUE(compare(expected_result, results[reduction_type]));
         }
       }
diff --git a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
new file mode 100644
index 00000000000..5fa37250e21
--- /dev/null
+++ b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/device_span.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+
+#include <gtest/gtest.h>
+#include <prims/transform_e.cuh>
+#include <prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh>
+#include <prims/update_edge_src_dst_property.cuh>
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <random>
+
+template <typename vertex_t, typename edge_t>
+struct intersection_op_t {
+  __device__ thrust::tuple<edge_t, edge_t, edge_t> operator()(
+    vertex_t v0,  // not read
+    vertex_t v1,  // not read
+    edge_t v0_prop,
+    edge_t v1_prop,
+    raft::device_span<vertex_t const> intersection) const
+  {
+    auto const prop_sum = v0_prop + v1_prop;  // reused for the first two tuple elements
+    return thrust::make_tuple(prop_sum, prop_sum, static_cast<edge_t>(intersection.size()));
+  }
+};
+
+struct Prims_Usecase {
+  bool edge_masking = false;      // attach a generated edge mask to the MG graph view
+  bool check_correctness = true;  // compare MG output with an SG recomputation on rank 0
+};
+
+template <typename input_usecase_t>
+class Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Check MG transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v against the SG result
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    // only rank 0 runs the SG reference and asserts in step 3; query the rank once up front
+    auto const comm_rank = handle_->get_comms().get_rank();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, false, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v primitive
+
+    const int hash_bin_count = 5;
+    const int initial_value  = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, edge_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, edge_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, edge_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, edge_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_result_buffer = rmm::device_uvector<edge_t>(
+      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v");
+    }
+
+    cugraph::transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      intersection_op_t<vertex_t, edge_t>{},
+      property_initial_value,
+      mg_result_buffer.begin());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. validate MG results
+
+    if (prims_usecase.check_correctness) {
+      rmm::device_uvector<edge_t> mg_aggregate_result_buffer(0, handle_->get_stream());
+      std::tie(std::ignore, mg_aggregate_result_buffer) =
+        cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+          *handle_,
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          mg_graph_view.local_vertex_partition_range(),
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          raft::device_span<edge_t const>(mg_result_buffer.data(), mg_result_buffer.size()));
+
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      if (comm_rank == 0) {
+        auto sg_graph_view = sg_graph.view();
+
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, edge_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, edge_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, edge_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+
+        auto sg_result_buffer = cugraph::allocate_dataframe_buffer<edge_t>(
+          sg_graph_view.number_of_vertices(), handle_->get_stream());
+
+        cugraph::transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          intersection_op_t<vertex_t, edge_t>{},
+          property_initial_value,
+          sg_result_buffer.begin());
+
+        bool valid = thrust::equal(handle_->get_thrust_policy(),
+                                   mg_aggregate_result_buffer.begin(),
+                                   mg_aggregate_result_buffer.end(),
+                                   sg_result_buffer.begin());
+
+        ASSERT_TRUE(valid);
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File =
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<cugraph::test::File_Usecase>;
+using Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat =
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File, CheckInt32Int32Float)
+{
+  auto [prims_usecase, input_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float>(prims_usecase, input_usecase);
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt32Int32Float)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt32Int64Float)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int64_t, float>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt64Int64Float)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int64_t, int64_t, float>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
+                                                              Prims_Usecase{true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index c8ce9fc3a47..53f37e83b30 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -17,6 +17,7 @@
 #include "prims/transform_reduce_e.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -52,44 +53,6 @@
 
 #include <random>
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-3};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::abs(t1 - t2) < (std::max(t1, t2) * threshold_ratio);
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using type = thrust::tuple<Args...>;
-  constexpr auto operator()(const type& t1, const type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::abs(t1 - t2) < (std::max(t1, t2) * threshold_ratio);
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool test_weighted{false};
   bool edge_masking{false};
@@ -231,7 +194,7 @@ class Tests_MGTransformReduceE
             }
           },
           property_initial_value);
-        result_compare<result_t> compare{};
+        cugraph::test::scalar_result_compare compare{};
         ASSERT_TRUE(compare(expected_result, result));
       }
     }
diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
new file mode 100644
index 00000000000..457e6b5ab93
--- /dev/null
+++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+#include "result_compare.cuh"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cuco/hash_functions.cuh>
+
+#include <gtest/gtest.h>
+#include <prims/reduce_op.cuh>
+#include <prims/transform_reduce_e_by_src_dst_key.cuh>
+#include <prims/update_edge_src_dst_property.cuh>
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <random>
+
+struct Prims_Usecase {
+  bool test_weighted = false;     // forwarded to construct_graph() when building the MG graph
+  bool edge_masking = false;      // attach a generated edge mask to the MG graph view
+  bool check_correctness = true;  // compare MG output with an SG recomputation on rank 0
+};
+
+template <typename input_usecase_t>
+class Tests_MGTransformReduceEBySrcDstKey
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGTransformReduceEBySrcDstKey() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Run transform_reduce_e_by_src_key & _by_dst_key on the MG graph and compare with SG results
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename result_t,
+            bool store_transposed>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, store_transposed, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, true>(
+        *handle_, input_usecase, prims_usecase.test_weighted, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG transform_reduce_e_by_src_key & transform_reduce_e_by_dst_key
+
+    const int hash_bin_count = 5;
+    const int initial_value  = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_key = cugraph::test::generate<vertex_t, vertex_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+    auto mg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_e_by_src_key");
+    }
+
+    auto [by_src_keys, by_src_values] = transform_reduce_e_by_src_key(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      mg_src_key.view(),
+      [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+        if (src_property < dst_property) {
+          return src_property;
+        } else {
+          return dst_property;
+        }
+      },
+      property_initial_value,
+      cugraph::reduce_op::plus<result_t>{});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_e_by_dst_key");
+    }
+
+    auto [by_dst_keys, by_dst_values] = transform_reduce_e_by_dst_key(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      mg_dst_key.view(),
+      [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+        if (src_property < dst_property) {
+          return src_property;
+        } else {
+          return dst_property;
+        }
+      },
+      property_initial_value,
+      cugraph::reduce_op::plus<result_t>{});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. compare SG & MG results (both sorted by key first; checked on rank 0 only)
+
+    if (prims_usecase.check_correctness) {
+      auto mg_aggregate_by_src_keys =
+        cugraph::test::device_gatherv(*handle_, by_src_keys.data(), by_src_keys.size());
+      auto mg_aggregate_by_src_values =
+        cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+      if constexpr (std::is_arithmetic_v<result_t>) {  // arithmetic result_t: one buffer
+        mg_aggregate_by_src_values =
+          cugraph::test::device_gatherv(*handle_, by_src_values.data(), by_src_values.size());
+      } else {  // otherwise: gather the element 0 and element 1 buffers separately
+        std::get<0>(mg_aggregate_by_src_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<0>(by_src_values).data(), std::get<0>(by_src_values).size());
+        std::get<1>(mg_aggregate_by_src_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<1>(by_src_values).data(), std::get<1>(by_src_values).size());
+      }
+      thrust::sort_by_key(handle_->get_thrust_policy(),
+                          mg_aggregate_by_src_keys.begin(),
+                          mg_aggregate_by_src_keys.end(),
+                          cugraph::get_dataframe_buffer_begin(mg_aggregate_by_src_values));
+
+      auto mg_aggregate_by_dst_keys =
+        cugraph::test::device_gatherv(*handle_, by_dst_keys.data(), by_dst_keys.size());
+      auto mg_aggregate_by_dst_values =
+        cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+      if constexpr (std::is_arithmetic_v<result_t>) {  // arithmetic result_t: one buffer
+        mg_aggregate_by_dst_values =
+          cugraph::test::device_gatherv(*handle_, by_dst_values.data(), by_dst_values.size());
+      } else {  // otherwise: gather the element 0 and element 1 buffers separately
+        std::get<0>(mg_aggregate_by_dst_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<0>(by_dst_values).data(), std::get<0>(by_dst_values).size());
+        std::get<1>(mg_aggregate_by_dst_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<1>(by_dst_values).data(), std::get<1>(by_dst_values).size());
+      }
+      thrust::sort_by_key(handle_->get_thrust_policy(),
+                          mg_aggregate_by_dst_keys.begin(),
+                          mg_aggregate_by_dst_keys.end(),
+                          cugraph::get_dataframe_buffer_begin(mg_aggregate_by_dst_values));
+
+      cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
+      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      if (handle_->get_comms().get_rank() == 0) {
+        auto sg_graph_view = sg_graph.view();
+
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+
+        auto sg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_key = cugraph::test::generate<vertex_t, vertex_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_key);
+        auto sg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_key);
+
+        auto [sg_by_src_keys, sg_by_src_values] = transform_reduce_e_by_src_key(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          sg_src_key.view(),
+          [] __device__(
+            auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+            if (src_property < dst_property) {
+              return src_property;
+            } else {
+              return dst_property;
+            }
+          },
+          property_initial_value,
+          cugraph::reduce_op::plus<result_t>{});
+        thrust::sort_by_key(handle_->get_thrust_policy(),
+                            sg_by_src_keys.begin(),
+                            sg_by_src_keys.end(),
+                            cugraph::get_dataframe_buffer_begin(sg_by_src_values));
+
+        auto [sg_by_dst_keys, sg_by_dst_values] = transform_reduce_e_by_dst_key(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          sg_dst_key.view(),
+          [] __device__(
+            auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+            if (src_property < dst_property) {
+              return src_property;
+            } else {
+              return dst_property;
+            }
+          },
+          property_initial_value,
+          cugraph::reduce_op::plus<result_t>{});
+        thrust::sort_by_key(handle_->get_thrust_policy(),
+                            sg_by_dst_keys.begin(),
+                            sg_by_dst_keys.end(),
+                            cugraph::get_dataframe_buffer_begin(sg_by_dst_values));
+
+        cugraph::test::vector_result_compare compare{*handle_};
+
+        ASSERT_TRUE(compare(sg_by_src_keys, mg_aggregate_by_src_keys));
+        ASSERT_TRUE(compare(sg_by_src_values, mg_aggregate_by_src_values));
+
+        ASSERT_TRUE(compare(sg_by_dst_keys, mg_aggregate_by_dst_keys));
+        ASSERT_TRUE(compare(sg_by_dst_values, mg_aggregate_by_dst_values));
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGTransformReduceEBySrcDstKey<input_usecase_t>::handle_ =
+  nullptr;
+
+using Tests_MGTransformReduceEBySrcDstKey_File =
+  Tests_MGTransformReduceEBySrcDstKey<cugraph::test::File_Usecase>;
+using Tests_MGTransformReduceEBySrcDstKey_Rmat =
+  Tests_MGTransformReduceEBySrcDstKey<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto [prims_usecase, input_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(prims_usecase,
+                                                                              input_usecase);
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, false>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, false>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto [prims_usecase, input_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(prims_usecase,
+                                                                             input_usecase);
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto [prims_usecase, rmat_usecase] = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
+    prims_usecase,
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(rmat_usecase));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGTransformReduceEBySrcDstKey_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGTransformReduceEBySrcDstKey_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGTransformReduceEBySrcDstKey_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu
index c0d44bc94f1..c954f31d0f9 100644
--- a/cpp/tests/prims/mg_transform_reduce_v.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v.cu
@@ -16,6 +16,7 @@
 
 #include "prims/transform_reduce_v.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -56,50 +57,6 @@ struct v_op_t {
   }
 };
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-3};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using Type = thrust::tuple<Args...>;
-  constexpr auto operator()(const Type& t1, const Type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<Type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -254,7 +211,7 @@ class Tests_MGTransformReduceV
               break;
             default: FAIL() << "should not be reached.";
           }
-          result_compare<result_t> compare{};
+          cugraph::test::scalar_result_compare compare{};
           ASSERT_TRUE(compare(expected_result, results[reduction_type]));
         }
       }
diff --git a/cpp/tests/prims/result_compare.cuh b/cpp/tests/prims/result_compare.cuh
new file mode 100644
index 00000000000..5a1abb90e3c
--- /dev/null
+++ b/cpp/tests/prims/result_compare.cuh
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/core/handle.hpp>
+
+#include <thrust/equal.h>
+#include <thrust/optional.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+#include <utility>
+
+namespace cugraph {
+namespace test {
+
+namespace detail {
+
+// Returns true when val0 and val1 are exactly equal (this also covers equal infinities, whose
+// difference is NaN) or, if threshold_ratio is set, when their absolute difference is at most
+// threshold_ratio times the larger of the two magnitudes.
+template <typename T>
+__host__ __device__ bool compare_arithmetic_scalar(T val0,
+                                                   T val1,
+                                                   thrust::optional<T> threshold_ratio)
+{
+  if ((val0 == val1) || !threshold_ratio) { return val0 == val1; }
+  return std::abs(val0 - val1) <= (std::max(std::abs(val0), std::abs(val1)) * *threshold_ratio);
+}
+
+}  // namespace detail
+
+// Binary predicate comparing two arithmetic values, or two thrust tuples of arithmetic values
+// element by element. Floating-point values are compared within a relative tolerance of
+// threshold_ratio; all other arithmetic types must match exactly.
+template <typename T>
+struct comparator {
+  static constexpr double threshold_ratio{1e-2};
+
+  __host__ __device__ bool operator()(T t0, T t1) const
+  {
+    static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+    if constexpr (std::is_arithmetic_v<T>) {
+      return compare_element(t0, t1);
+    } else {
+      // Visit every tuple element (the previous version silently ignored elements past the
+      // second one when assert() was compiled out).
+      return compare_tuple(t0, t1, std::make_index_sequence<thrust::tuple_size<T>::value>());
+    }
+  }
+
+ private:
+  // Compares one arithmetic element; only floating-point types get the relative tolerance.
+  template <typename U>
+  __host__ __device__ static bool compare_element(U val0, U val1)
+  {
+    if constexpr (std::is_floating_point_v<U>) {
+      return detail::compare_arithmetic_scalar(val0, val1, thrust::optional<U>{threshold_ratio});
+    } else {
+      return detail::compare_arithmetic_scalar(val0, val1, thrust::optional<U>{thrust::nullopt});
+    }
+  }
+
+  // Logical AND of compare_element() over the tuple elements selected by Is...
+  template <std::size_t... Is>
+  __host__ __device__ static bool compare_tuple(T const& t0,
+                                                T const& t1,
+                                                std::index_sequence<Is...>)
+  {
+    return (... && compare_element(thrust::get<Is>(t0), thrust::get<Is>(t1)));
+  }
+};
+
+// Compares two scalar results; thrust tuples are compared element by element.
+struct scalar_result_compare {
+  template <typename... Args>
+  auto operator()(thrust::tuple<Args...> t1, thrust::tuple<Args...> t2)
+  {
+    constexpr auto num_elements = thrust::tuple_size<thrust::tuple<Args...>>::value;
+    return equality_impl(t1, t2, std::make_index_sequence<num_elements>());
+  }
+
+  template <typename T>
+  auto operator()(T t1, T t2)
+  {
+    return comparator<T>{}(t1, t2);
+  }
+
+ private:
+  template <typename T, std::size_t... I>
+  auto equality_impl(T t1, T t2, std::index_sequence<I...>)
+  {
+    return (... && (this->operator()(thrust::get<I>(t1), thrust::get<I>(t2))));
+  }
+};
+
+// Element-wise comparison of device vectors (or of std::tuples of device vectors).
+struct vector_result_compare {
+  const raft::handle_t& handle_;
+  vector_result_compare(raft::handle_t const& handle) : handle_(handle) {}
+
+  template <typename... Args>
+  auto operator()(std::tuple<rmm::device_uvector<Args>...> const& t1,
+                  std::tuple<rmm::device_uvector<Args>...> const& t2)
+  {
+    return equality_impl(t1, t2, std::index_sequence_for<Args...>());
+  }
+
+  template <typename T>
+  auto operator()(rmm::device_uvector<T> const& t1, rmm::device_uvector<T> const& t2)
+  {
+    if (t1.size() != t2.size()) { return false; }  // thrust::equal() ignores t2's length
+    return thrust::equal(
+      handle_.get_thrust_policy(), t1.begin(), t1.end(), t2.begin(), comparator<T>());
+  }
+
+ private:
+  template <typename T, std::size_t... I>
+  auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
+  {
+    return (... && (vector_result_compare::operator()(std::get<I>(t1), std::get<I>(t2))));
+  }
+};
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/dependencies.yaml b/dependencies.yaml
index e6cf6c9e93c..d8be5352c7d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -449,7 +449,7 @@ dependencies:
           - &dask rapids-dask-dependency==24.4.*
           - &dask_cuda dask-cuda==24.4.*
           - &numba numba>=0.57
-          - &numpy numpy>=1.23
+          - &numpy numpy>=1.23,<2.0a0
           - &ucx_py ucx-py==0.37.*
       - output_types: conda
         packages:
diff --git a/docs/cugraph/source/api_docs/index.rst b/docs/cugraph/source/api_docs/index.rst
index 1b907165a39..d76902772fb 100644
--- a/docs/cugraph/source/api_docs/index.rst
+++ b/docs/cugraph/source/api_docs/index.rst
@@ -15,7 +15,7 @@ Core Graph API Documentation
     cugraph_c/index.rst
     cugraph_cpp/index.rst
 
-Graph Nerual Networks API Documentation
+Graph Neural Networks API Documentation
 ---------------------------------------
 
 .. toctree::
diff --git a/docs/cugraph/source/basics/nx_transition.rst b/docs/cugraph/source/basics/nx_transition.rst
index 9849865814d..07c2ad26ffa 100644
--- a/docs/cugraph/source/basics/nx_transition.rst
+++ b/docs/cugraph/source/basics/nx_transition.rst
@@ -1,30 +1,20 @@
 **************************************
-NetworkX Compatibility and Transition
+NetworkX by calling cuGraph Algorithms
 **************************************
 
+
 *Note: this is a work in progress and will be updatred and changed as we better flesh out
 compatibility issues*
 
-One of the goals of RAPIDS cuGraph is to mimic the NetworkX API to simplify
-the transition to accelerated GPU data science.  However, graph analysis,
-also called network science, like most other data science workflow, is more
-than just running an algorithm.  Graph data requires cleaning and prep (ETL)
-and then the construction of a graph object; that is all before the execution
-of a graph algorithm.  RAPIDS and cuGraph allow a portion or the complete
-analytic workflow to be accelerated.  To achieve the maximum amount of
-acceleration, we encourage fully replacing existing code with cuGraph.
-But sometimes it is easier to replace just a portion.
-
-Last Update
-###########
+Latest Update
+#############
 
-Last Update:	Oct 14th, 2020
-Release:		0.16
+Last Update:	March 7th, 2024
+Release:		24.04
 
-Information on `NetworkX <https://networkx.github.io/documentation/stable/index.html>`_
-
-This transition guide in an expansion of the Medium Blog on `NetworkX Compatibility
-<https://medium.com/rapids-ai/rapids-cugraph-networkx-compatibility-d119e417557c>`_
+cuGraph is now a registered backend for NetworkX. This is described in the following blog:
+`Accelerating NetworkX on NVIDIA GPUs for High Performance Graph Analytics
+<https://developer.nvidia.com/blog/accelerating-networkx-on-nvidia-gpus-for-high-performance-graph-analytics/>`_
 
 
 Easy Path – Use NetworkX Graph Objects, Accelerated Algorithms
@@ -33,12 +23,11 @@ Easy Path – Use NetworkX Graph Objects, Accelerated Algorithms
 Rather than updating all of your existing code, simply update the calls to
 graph algorithms by replacing the module name.  This allows all the complicated
 ETL code to be unchanged while still seeing significate performance
+improvements. Note that this approach will be deprecated, since NetworkX dispatching to
+nx_cugraph has many advantages.
+
 improvements.
 
-In the following example, the cuGraph module is being imported as “cnx”.
-While module can be assigned any name can be used, we picked cnx to reduce
-the amount of text to be changed. The text highlighted in yellow indicates
-changes.
 
 .. image:: ../images/Nx_Cg_1.png
   :width: 600
@@ -49,9 +38,6 @@ input and match the NetworkX API list of arguments.
 Currently, cuGraph accepts both NetworkX Graph and DiGraph objects. We will be
 adding support for Bipartite graph and Multigraph over the next few releases.
 
-|
-
-
 Differences in Algorithms
 ##########################
 
@@ -169,8 +155,8 @@ Graph Building
 ##############
 
 The biggest difference between NetworkX and cuGraph is with how Graph objects
-are built.  NetworkX, for the most part, stores graph data in a dictionary.
-That structure allows easy insertion of new records.    Consider the following
+are built. NetworkX, for the most part, stores graph data in a dictionary.
+That structure allows easy insertion of new records. Consider the following
 code for building a NetworkX Graph::
 
     # Read the node data
diff --git a/docs/cugraph/source/graph_support/property_graph.md b/docs/cugraph/source/graph_support/property_graph.md
index ef07be79ba0..94d170c18df 100644
--- a/docs/cugraph/source/graph_support/property_graph.md
+++ b/docs/cugraph/source/graph_support/property_graph.md
@@ -21,7 +21,7 @@ import cugraph
 from cugraph.experimental import PropertyGraph
 
 # Import a built-in dataset
-from cugraph.experimental.datasets import karate
+from cugraph.datasets import karate
 
 # Read edgelist data into a DataFrame, load into PropertyGraph as edge data.
 # Create a graph using the imported Dataset object
diff --git a/docs/cugraph/source/images/ancestors.png b/docs/cugraph/source/images/ancestors.png
new file mode 100644
index 00000000000..37b8e7933a8
Binary files /dev/null and b/docs/cugraph/source/images/ancestors.png differ
diff --git a/docs/cugraph/source/images/bfs_tree.png b/docs/cugraph/source/images/bfs_tree.png
new file mode 100644
index 00000000000..5bca39ca3bf
Binary files /dev/null and b/docs/cugraph/source/images/bfs_tree.png differ
diff --git a/docs/cugraph/source/images/conn_component.png b/docs/cugraph/source/images/conn_component.png
new file mode 100644
index 00000000000..b7db09657c8
Binary files /dev/null and b/docs/cugraph/source/images/conn_component.png differ
diff --git a/docs/cugraph/source/images/descendents.png b/docs/cugraph/source/images/descendents.png
new file mode 100644
index 00000000000..8afc38b4ef4
Binary files /dev/null and b/docs/cugraph/source/images/descendents.png differ
diff --git a/docs/cugraph/source/images/k_truss.png b/docs/cugraph/source/images/k_truss.png
new file mode 100644
index 00000000000..78a1978d103
Binary files /dev/null and b/docs/cugraph/source/images/k_truss.png differ
diff --git a/docs/cugraph/source/images/katz.png b/docs/cugraph/source/images/katz.png
new file mode 100644
index 00000000000..9f2303a21e3
Binary files /dev/null and b/docs/cugraph/source/images/katz.png differ
diff --git a/docs/cugraph/source/images/pagerank.png b/docs/cugraph/source/images/pagerank.png
new file mode 100644
index 00000000000..193c0a8bbd1
Binary files /dev/null and b/docs/cugraph/source/images/pagerank.png differ
diff --git a/docs/cugraph/source/images/sssp.png b/docs/cugraph/source/images/sssp.png
new file mode 100644
index 00000000000..2c9dfc36852
Binary files /dev/null and b/docs/cugraph/source/images/sssp.png differ
diff --git a/docs/cugraph/source/images/wcc.png b/docs/cugraph/source/images/wcc.png
new file mode 100644
index 00000000000..2d27a3f675c
Binary files /dev/null and b/docs/cugraph/source/images/wcc.png differ
diff --git a/docs/cugraph/source/index.rst b/docs/cugraph/source/index.rst
index b18a79d3396..9ea9e4d65cf 100644
--- a/docs/cugraph/source/index.rst
+++ b/docs/cugraph/source/index.rst
@@ -46,6 +46,7 @@ the docs and links
    :caption: Contents:
 
    basics/index
+   nx_cugraph/index
    installation/index
    tutorials/index
    graph_support/index
diff --git a/docs/cugraph/source/nx_cugraph/index.rst b/docs/cugraph/source/nx_cugraph/index.rst
new file mode 100644
index 00000000000..ef6f51601ab
--- /dev/null
+++ b/docs/cugraph/source/nx_cugraph/index.rst
@@ -0,0 +1,9 @@
+===============================
+nxCugraph as a NetworkX Backend
+===============================
+
+
+.. toctree::
+   :maxdepth: 2
+
+   nx_cugraph.md
diff --git a/docs/cugraph/source/nx_cugraph/nx_cugraph.md b/docs/cugraph/source/nx_cugraph/nx_cugraph.md
new file mode 100644
index 00000000000..8d497e3a1d7
--- /dev/null
+++ b/docs/cugraph/source/nx_cugraph/nx_cugraph.md
@@ -0,0 +1,165 @@
+### nx_cugraph
+
+
+Previous versions of cuGraph have included mechanisms to make it
+trivial to plug in cuGraph algorithm calls. Beginning with version 24.02, nx-cugraph
+is now a [networkX backend](<https://networkx.org/documentation/stable/reference/utils.html#backends>).
+The user now need only [install nx-cugraph](<https://github.com/rapidsai/cugraph/blob/branch-24.04/python/nx-cugraph/README.md#install>)
+to experience GPU speedups.
+
+Let's look at some examples of algorithm speedups comparing CPU-based NetworkX to dispatched versions run on GPU with nx_cugraph.
+
+Each chart has three measurements.
+* NX - running the algorithm natively with networkX on CPU.
+* nx-cugraph - running with GPU accelerated networkX achieved by simply calling the cugraph backend. This pays the overhead of building the GPU resident object for each algorithm called. This achieves significant improvement but still isn't completely optimal.
+* nx-cugraph (preconvert) - This is a bit more complicated since it involves building (precomputing) the GPU resident graph ahead of time and reusing it for each algorithm.
+
+
+![Ancestors](../images/ancestors.png)
+![BFS Tree](../images/bfs_tree.png)
+![Connected Components](../images/conn_component.png)
+![Descendants](../images/descendents.png)
+![Katz](../images/katz.png)
+![Pagerank](../images/pagerank.png)
+![Single Source Shortest Path](../images/sssp.png)
+![Weakly Connected Components](../images/wcc.png)
+
+
+The following algorithms are supported and automatically dispatched to nx-cuGraph for acceleration.
+
+#### Algorithms
+```
+bipartite
+ ├─ basic
+ │   └─ is_bipartite
+ └─ generators
+     └─ complete_bipartite_graph
+centrality
+ ├─ betweenness
+ │   ├─ betweenness_centrality
+ │   └─ edge_betweenness_centrality
+ ├─ degree_alg
+ │   ├─ degree_centrality
+ │   ├─ in_degree_centrality
+ │   └─ out_degree_centrality
+ ├─ eigenvector
+ │   └─ eigenvector_centrality
+ └─ katz
+     └─ katz_centrality
+cluster
+ ├─ average_clustering
+ ├─ clustering
+ ├─ transitivity
+ └─ triangles
+community
+ └─ louvain
+     └─ louvain_communities
+components
+ ├─ connected
+ │   ├─ connected_components
+ │   ├─ is_connected
+ │   ├─ node_connected_component
+ │   └─ number_connected_components
+ └─ weakly_connected
+     ├─ is_weakly_connected
+     ├─ number_weakly_connected_components
+     └─ weakly_connected_components
+core
+ ├─ core_number
+ └─ k_truss
+dag
+ ├─ ancestors
+ └─ descendants
+isolate
+ ├─ is_isolate
+ ├─ isolates
+ └─ number_of_isolates
+link_analysis
+ ├─ hits_alg
+ │   └─ hits
+ └─ pagerank_alg
+     └─ pagerank
+operators
+ └─ unary
+     ├─ complement
+     └─ reverse
+reciprocity
+ ├─ overall_reciprocity
+ └─ reciprocity
+shortest_paths
+ └─ unweighted
+     ├─ single_source_shortest_path_length
+     └─ single_target_shortest_path_length
+traversal
+ └─ breadth_first_search
+     ├─ bfs_edges
+     ├─ bfs_layers
+     ├─ bfs_predecessors
+     ├─ bfs_successors
+     ├─ bfs_tree
+     ├─ descendants_at_distance
+     └─ generic_bfs_edges
+tree
+ └─ recognition
+     ├─ is_arborescence
+     ├─ is_branching
+     ├─ is_forest
+     └─ is_tree
+```
+
+#### Generators
+```
+classic
+ ├─ barbell_graph
+ ├─ circular_ladder_graph
+ ├─ complete_graph
+ ├─ complete_multipartite_graph
+ ├─ cycle_graph
+ ├─ empty_graph
+ ├─ ladder_graph
+ ├─ lollipop_graph
+ ├─ null_graph
+ ├─ path_graph
+ ├─ star_graph
+ ├─ tadpole_graph
+ ├─ trivial_graph
+ ├─ turan_graph
+ └─ wheel_graph
+community
+ └─ caveman_graph
+small
+ ├─ bull_graph
+ ├─ chvatal_graph
+ ├─ cubical_graph
+ ├─ desargues_graph
+ ├─ diamond_graph
+ ├─ dodecahedral_graph
+ ├─ frucht_graph
+ ├─ heawood_graph
+ ├─ house_graph
+ ├─ house_x_graph
+ ├─ icosahedral_graph
+ ├─ krackhardt_kite_graph
+ ├─ moebius_kantor_graph
+ ├─ octahedral_graph
+ ├─ pappus_graph
+ ├─ petersen_graph
+ ├─ sedgewick_maze_graph
+ ├─ tetrahedral_graph
+ ├─ truncated_cube_graph
+ ├─ truncated_tetrahedron_graph
+ └─ tutte_graph
+social
+ ├─ davis_southern_women_graph
+ ├─ florentine_families_graph
+ ├─ karate_club_graph
+ └─ les_miserables_graph
+```
+
+#### Other
+
+```
+convert_matrix
+ ├─ from_pandas_edgelist
+ └─ from_scipy_sparse_array
+```
diff --git a/docs/cugraph/source/tutorials/community_resources.md b/docs/cugraph/source/tutorials/community_resources.md
index 1c4362393d1..975f11965de 100644
--- a/docs/cugraph/source/tutorials/community_resources.md
+++ b/docs/cugraph/source/tutorials/community_resources.md
@@ -1,2 +1,4 @@
 # Commmunity Resources
 [Rapids Community Repository](https://github.com/rapidsai-community/notebooks-contrib)
+[RAPIDS Containers on Docker Hub](https://catalog.ngc.nvidia.com/containers)
+[RAPIDS PyTorch Container in Docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pyg)
diff --git a/docs/cugraph/source/tutorials/cugraph_blogs.rst b/docs/cugraph/source/tutorials/cugraph_blogs.rst
index 373e846f6c3..3665f425e3f 100644
--- a/docs/cugraph/source/tutorials/cugraph_blogs.rst
+++ b/docs/cugraph/source/tutorials/cugraph_blogs.rst
@@ -9,6 +9,17 @@ Here, we've selected just a few that are of particular interest to cuGraph users
 
 Blogs & Conferences
 ====================
+2024
+------
+Coming Soon
+
+2023
+------
+  * `Intro to Graph Neural Networks with cuGraph-DGL <https://medium.com/rapids-ai/introduction-to-graph-neural-networks-with-cugraph-dgl-64c632e9cc52>`_
+  * `GTC 2023 Ask the Experts Q&A <https://forums.developer.nvidia.com/c/blogs-events/connect-with-experts/ama-cugraph/652?ncid=em-even-260150-vt33#cid=dev03_em-even_en-us>`_
+  * `Accelerating NetworkX on NVIDIA GPUs for High Performance Graph Analytics <https://developer.nvidia.com/blog/accelerating-networkx-on-nvidia-gpus-for-high-performance-graph-analytics/>`_
+  * `Introduction to Graph Neural Networks with NVIDIA cuGraph-DGL <https://developer.nvidia.com/blog/introduction-to-graph-neural-networks-with-nvidia-cugraph-dgl/>`_
+  * `Supercharge Graph Analytics at Scale with GPU-CPU Fusion for 100x Performance <https://developer.nvidia.com/blog/supercharge-graph-analytics-at-scale-with-gpu-cpu-fusion-for-100x-performance/>`_
 2022
 ------
   * `GTC: State of cuGraph  (video & slides) <https://www.nvidia.com/gtc/session-catalog/?search=cuGraph&tab.scheduledorondemand=1583520458947001NJiE&search=cuGraph#/session/1635793340204001n4p2>`_
@@ -50,6 +61,8 @@ Media
 Academic Papers
 ===============
 
+ * Seunghwa Kang, Chuck Hastings, Joe Eaton, Brad Rees `cuGraph C++ primitives: vertex/edge-centric building blocks for parallel graph computing <https://ieeexplore.ieee.org/abstract/document/10196665>`_
+
  * Alex Fender, Brad Rees, Joe Eaton (2022) `Massive Graph Analytics <https://books.google.com/books?hl=en&lr=&id=QspxEAAAQBAJ&oi=fnd&pg=PT8&dq=book:%22Massive+Graph+Analytics%22&ots=3HAGJ0njKO&sig=8e4v0azmzA6LTQNUNgPw-uTLkoc#v=onepage&q&f=false>`_  Bader, D. (Editor) CRC Press
 
  * S Kang, A. Fender, J. Eaton, B. Rees:`Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters`. In IEEE HPEC, Sep. 2020
@@ -58,6 +71,8 @@ Academic Papers
 
  * Richardson, B., Rees, B., Drabas, T., Oldridge, E., Bader, D. A., & Allen, R. (2020, August). Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (pp. 3503-3504).
 
+ * A Gondhalekar, P Sathre, W Feng `Hybrid CPU-GPU Implementation of Edge-Connected Jaccard Similarity in Graph Datasets <https://sc23.supercomputing.org/proceedings/tech_poster/poster_files/rpost221s3-file3.pdf>`_
+
 
 Other Blogs
 ========================
diff --git a/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb b/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb
index 86bb4d17c22..9f62fd4f421 100755
--- a/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb
+++ b/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb
@@ -8,12 +8,7 @@
     "# Jaccard Similarity\n",
     "----\n",
     "\n",
-    "In this notebook we will explore the Jaccard vertex similarity metrics available in cuGraph.\n",
-    "\n",
-    "cuGraph supports Jaccard similarity for both unweighted and weighted graphs, but this notebook \n",
-    "will demonstrate Jaccard similarity only on unweighted graphs. A future update will include an \n",
-    "example using a graph with edge weights, where the weights are used to influence the Jaccard \n",
-    "similarity coefficients."
+    "In this notebook we will explore the Jaccard vertex similarity metrics available in cuGraph."
    ]
   },
   {
@@ -23,48 +18,30 @@
    "source": [
     "## Introduction\n",
     "\n",
-    "The Jaccard similarity between two sets is defined as the ratio of the volume of their intersection \n",
-    "divided by the volume of their union, where the sets used are the sets of neighboring vertices for each \n",
-    "vertex.\n",
-    "\n",
-    "The neighbors of a vertex, _v_, is defined as the set, _U_, of vertices connected by way of an edge to vertex v, or _N(v) = {U} where v ∈ V and ∀ u ∈ U ∃ edge(v,u)∈ E_.\n",
+    "The Jaccard similarity between two sets is defined as the ratio of the volume of their intersection divided by the volume of their union. \n",
     "\n",
-    "If we then let set __A__ be the set of neighbors for vertex _a_, and set __B__ be the set of neighbors for vertex _b_, then the Jaccard Similarity for the vertex pair _(a, b)_ can be expressed as\n",
+    "If __A__ and __B__ are the sets of neighbors of two vertices, the Jaccard Similarity of that vertex pair can be expressed as\n",
     "\n",
     "$\\text{Jaccard similarity} = \\frac{|A \\cap B|}{|A \\cup B|}$\n",
     "\n",
     "\n",
-    "cuGraph's Jaccard function will, by default, compute the Jaccard similarity coefficient for every pair of \n",
-    "vertices in the two-hop neighborhood for every vertex.\n",
-    "\n",
-    "```df = cugraph.jaccard(G, vertex_pair=None)```\n",
-    "\n",
-    "Parameters:\n",
+    "To compute the Jaccard similarity between all pairs of vertices connected by an edge in cuGraph use: <br>\n",
+    "__df = cugraph.jaccard(G)__\n",
     "\n",
     "    G: A cugraph.Graph object\n",
     "\n",
-    "    vertex_pair: cudf.DataFrame, optional (default=None)\n",
-    "        A GPU dataframe consisting of two columns representing pairs of\n",
-    "        vertices. If provided, the jaccard coefficient is computed for the\n",
-    "        given vertex pairs.  If the vertex_pair is not provided then the\n",
-    "        current implementation computes the jaccard coefficient for all\n",
-    "        adjacent vertices in the graph.\n",
-    "\n",
     "Returns:\n",
     "\n",
     "    df: cudf.DataFrame with three columns:\n",
     "        df[\"first\"]: The first vertex id of each pair.\n",
     "        df[\"second\"]: The second vertex id of each pair.\n",
     "        df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n",
-    "\n",
-    "To limit the computation to specific vertex pairs, including those not in the same two-hop \n",
-    "neighborhood, pass a `vertex_pair` value (see example below).\n",
+    "<br>\n",
     "\n",
     "__References__ \n",
     "- https://research.nvidia.com/publication/2017-11_Parallel-Jaccard-and \n",
     "\n",
     "__Additional Reading__ \n",
-    "- [Intro to Graph Analysis using cuGraph: Similarity Algorithms](https://medium.com/rapids-ai/intro-to-graph-analysis-using-cugraph-similarity-algorithms-64fa923791ac)\n",
     "- [Wikipedia: Jaccard](https://en.wikipedia.org/wiki/Jaccard_index)\n"
    ]
   },
@@ -94,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
@@ -119,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -138,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -157,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,189 +147,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>first</th>\n",
-       "      <th>second</th>\n",
-       "      <th>jaccard_coeff</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>541</th>\n",
-       "      <td>14</td>\n",
-       "      <td>15</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>542</th>\n",
-       "      <td>14</td>\n",
-       "      <td>18</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>543</th>\n",
-       "      <td>14</td>\n",
-       "      <td>20</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>544</th>\n",
-       "      <td>14</td>\n",
-       "      <td>22</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>561</th>\n",
-       "      <td>15</td>\n",
-       "      <td>18</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>562</th>\n",
-       "      <td>15</td>\n",
-       "      <td>20</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>563</th>\n",
-       "      <td>15</td>\n",
-       "      <td>22</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>587</th>\n",
-       "      <td>17</td>\n",
-       "      <td>21</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>605</th>\n",
-       "      <td>18</td>\n",
-       "      <td>20</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>606</th>\n",
-       "      <td>18</td>\n",
-       "      <td>22</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>625</th>\n",
-       "      <td>20</td>\n",
-       "      <td>22</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>299</th>\n",
-       "      <td>7</td>\n",
-       "      <td>13</td>\n",
-       "      <td>0.800000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>285</th>\n",
-       "      <td>6</td>\n",
-       "      <td>10</td>\n",
-       "      <td>0.750000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>388</th>\n",
-       "      <td>4</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.750000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>443</th>\n",
-       "      <td>19</td>\n",
-       "      <td>21</td>\n",
-       "      <td>0.666667</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>502</th>\n",
-       "      <td>9</td>\n",
-       "      <td>28</td>\n",
-       "      <td>0.666667</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>584</th>\n",
-       "      <td>17</td>\n",
-       "      <td>19</td>\n",
-       "      <td>0.666667</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>223</th>\n",
-       "      <td>13</td>\n",
-       "      <td>19</td>\n",
-       "      <td>0.600000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>45</th>\n",
-       "      <td>32</td>\n",
-       "      <td>33</td>\n",
-       "      <td>0.526316</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>310</th>\n",
-       "      <td>7</td>\n",
-       "      <td>12</td>\n",
-       "      <td>0.500000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     first  second  jaccard_coeff\n",
-       "541     14      15       1.000000\n",
-       "542     14      18       1.000000\n",
-       "543     14      20       1.000000\n",
-       "544     14      22       1.000000\n",
-       "561     15      18       1.000000\n",
-       "562     15      20       1.000000\n",
-       "563     15      22       1.000000\n",
-       "587     17      21       1.000000\n",
-       "605     18      20       1.000000\n",
-       "606     18      22       1.000000\n",
-       "625     20      22       1.000000\n",
-       "299      7      13       0.800000\n",
-       "285      6      10       0.750000\n",
-       "388      4       5       0.750000\n",
-       "443     19      21       0.666667\n",
-       "502      9      28       0.666667\n",
-       "584     17      19       0.666667\n",
-       "223     13      19       0.600000\n",
-       "45      32      33       0.526316\n",
-       "310      7      12       0.500000"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Show the top-20 most similar vertices.\n",
     "jaccard_coeffs.head(20)"
@@ -372,63 +169,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If we want to see the similarity of a pair of vertices that are not part of \n",
-    "the same two-hop neighborhood, we have to specify them in a `cudf.DataFrame` \n",
-    "to pass to the `jaccard` call."
+    "To see the similarity of vertices that are not part of the same two-hop neighborhood,\n",
+    "we have to specify the vertex pairs in a `cudf.DataFrame` passed to `jaccard`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>first</th>\n",
-       "      <th>second</th>\n",
-       "      <th>jaccard_coeff</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>16</td>\n",
-       "      <td>33</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   first  second  jaccard_coeff\n",
-       "0     16      33            0.0"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "cugraph.jaccard(G, cudf.DataFrame([(16, 33)]))"
    ]
@@ -443,19 +192,75 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can use the `cudf.DataFrame` argument to pass in any number of specific vertex pairs \n",
-    "to compute the similarity for, regardless of whether or not they're included by default. \n",
-    "This is useful to limit the computation and result size when only specific vertex \n",
-    "similarities are needed."
+    "---\n",
+    "# Now we look at weighted Jaccard!\n",
+    "\n",
+    "A full explanation of the weighted jaccard is found [here](https://en.wikipedia.org/wiki/Jaccard_index#Weighted_Jaccard_similarity_and_distance).\n",
+    "\n",
+    "The Dining Preferences data set is a staple of small-scale social network analysis.\n",
+    "The data represents the first (weight = 1) and second (weight = 2) dining partner preference from a survey done in a small school dormitory.\n",
+    "\n",
+    "This data originated in a social network publication by J. L. Moreno.\n",
+    "\n",
+    "Reference: J. L. Moreno (1960). The Sociometry Reader. The Free Press, Glencoe, Illinois, pg.35\n",
+    "\n",
+    "\n",
+    "Here is a visualization of the dataset\n",
+    "<img src=\"../../img/dorm_data_diagram.png\" width=\"100%\"/>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### First pull in the dining preferences data set and load it into a cuGraph."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the dining preferences dataset from cugraph's examples\n",
+    "from cugraph.datasets import dining_prefs\n",
+    "# load the graph making sure to not ignore the weights\n",
+    "G = dining_prefs.get_graph(download=True, store_transposed=True, ignore_weights=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Do the calculations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# calculate both the unweighted and weighted Jaccard\n",
+    "jaccard_coeffs = cugraph.jaccard(G)\n",
+    "jaccard_weighted = cugraph.jaccard(G, use_weight=True)\n",
+    "# rename the weighted results\n",
+    "jaccard_weighted = jaccard_weighted.rename(columns={'jaccard_coeff' : 'weighted_jaccard' })"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Join the results dataframes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -482,46 +287,68 @@
        "      <th>first</th>\n",
        "      <th>second</th>\n",
        "      <th>jaccard_coeff</th>\n",
+       "      <th>weighted_jaccard</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>16</td>\n",
-       "      <td>33</td>\n",
-       "      <td>0.000000</td>\n",
+       "      <td>Lena</td>\n",
+       "      <td>Marion</td>\n",
+       "      <td>0.125000</td>\n",
+       "      <td>0.076923</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>32</td>\n",
-       "      <td>33</td>\n",
-       "      <td>0.526316</td>\n",
+       "      <td>Lena</td>\n",
+       "      <td>Adele</td>\n",
+       "      <td>0.142857</td>\n",
+       "      <td>0.090909</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>0</td>\n",
-       "      <td>23</td>\n",
-       "      <td>0.000000</td>\n",
+       "      <td>Lena</td>\n",
+       "      <td>Ellen</td>\n",
+       "      <td>0.166667</td>\n",
+       "      <td>0.100000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Lena</td>\n",
+       "      <td>Louise</td>\n",
+       "      <td>0.200000</td>\n",
+       "      <td>0.111111</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Louise</td>\n",
+       "      <td>Eva</td>\n",
+       "      <td>0.111111</td>\n",
+       "      <td>0.076923</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   first  second  jaccard_coeff\n",
-       "0     16      33       0.000000\n",
-       "1     32      33       0.526316\n",
-       "2      0      23       0.000000"
+       "    first  second  jaccard_coeff  weighted_jaccard\n",
+       "0    Lena  Marion       0.125000          0.076923\n",
+       "1    Lena   Adele       0.142857          0.090909\n",
+       "2    Lena   Ellen       0.166667          0.100000\n",
+       "3    Lena  Louise       0.200000          0.111111\n",
+       "4  Louise     Eva       0.111111          0.076923"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pairs = cudf.DataFrame([(16, 33), (32, 33), (0, 23)])\n",
-    "cugraph.jaccard(G, pairs)"
+    "# Merge the two results on the vertex pairs, then sort by weighted Jaccard (sort_values returns a new frame)\n",
+    "jaccard_merged = jaccard_coeffs.merge(jaccard_weighted, on=['first','second'], how='left')\n",
+    "jaccard_merged = jaccard_merged.sort_values('weighted_jaccard',ascending=False)\n",
+    "jaccard_merged.head()"
    ]
   },
   {
@@ -539,21 +366,6 @@
     "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
     "___"
    ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Revision History\n",
-    "\n",
-    "| Author        | Date       | Update           | cuGraph Version | Test Hardware             |\n",
-    "| --------------|------------|------------------|-----------------|---------------------------|\n",
-    "| Brad Rees     | 10/14/2019 | created          | 0.14            | GV100 32 GB, CUDA 10.2    |\n",
-    "| Don Acosta    | 07/20/2022 | tested/updated   | 22.08 nightly   | DGX Tesla V100, CUDA 11.5 |\n",
-    "| Ralph Liu     | 06/29/2023 | updated          | 23.08 nightly   | DGX Tesla V100, CUDA 12.0 |\n",
-    "| Rick Ratzel   | 02/23/2024 | tested/updated   | 24.04 nightly   | DGX Tesla V100, CUDA 12.0 |"
-   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/img/dorm_data_diagram.png b/notebooks/img/dorm_data_diagram.png
new file mode 100644
index 00000000000..e0780c9c8a3
Binary files /dev/null and b/notebooks/img/dorm_data_diagram.png differ
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
index 815fd30d8eb..f6fe38fe9f8 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -63,6 +63,10 @@ def __getitem__(self, idx: int):
 
         fn, batch_offset = self._batch_to_fn_d[idx]
         if fn != self._current_batch_fn:
+            # Remove current batches to free up memory
+            # before loading new batches
+            if hasattr(self, "_current_batches"):
+                del self._current_batches
             if self.sparse_format == "csc":
                 df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True)
                 self._current_batches = (
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
index cc4ce474f2d..e8813271fd8 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
@@ -186,6 +186,10 @@ def forward(
         nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         efeat: Optional[torch.Tensor] = None,
         max_in_degree: Optional[int] = None,
+        deterministic_dgrad: bool = False,
+        deterministic_wgrad: bool = False,
+        high_precision_dgrad: bool = False,
+        high_precision_wgrad: bool = False,
     ) -> torch.Tensor:
         r"""Forward computation.
 
@@ -204,6 +208,20 @@ def forward(
             from a neighbor sampler, the value should be set to the corresponding
             :attr:`fanout`. This option is used to invoke the MFG-variant of
             cugraph-ops kernel.
+        deterministic_dgrad : bool, default=False
+            Optional flag indicating whether the feature gradients
+            are computed deterministically using a dedicated workspace buffer.
+        deterministic_wgrad: bool, default=False
+            Optional flag indicating whether the weight gradients
+            are computed deterministically using a dedicated workspace buffer.
+        high_precision_dgrad: bool, default=False
+            Optional flag indicating whether gradients for inputs in half precision
+            are kept in single precision as long as possible and only cast to
+            the corresponding input type at the very end.
+        high_precision_wgrad: bool, default=False
+            Optional flag indicating whether gradients for weights in half precision
+            are kept in single precision as long as possible and only cast to
+            the corresponding input type at the very end.
 
         Returns
         -------
@@ -232,6 +250,8 @@ def forward(
         _graph = self.get_cugraph_ops_CSC(
             g, is_bipartite=bipartite, max_in_degree=max_in_degree
         )
+        if deterministic_dgrad:
+            _graph.add_reverse_graph()
 
         if bipartite:
             nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1]))
@@ -273,6 +293,10 @@ def forward(
             negative_slope=self.negative_slope,
             concat_heads=self.concat,
             edge_feat=efeat,
+            deterministic_dgrad=deterministic_dgrad,
+            deterministic_wgrad=deterministic_wgrad,
+            high_precision_dgrad=high_precision_dgrad,
+            high_precision_wgrad=high_precision_wgrad,
         )[: g.num_dst_nodes()]
 
         if self.concat:
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
index 6c78b4df0b8..4f47005f8ee 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
@@ -150,6 +150,8 @@ def forward(
         nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         efeat: Optional[torch.Tensor] = None,
         max_in_degree: Optional[int] = None,
+        deterministic_dgrad: bool = False,
+        deterministic_wgrad: bool = False,
     ) -> torch.Tensor:
         r"""Forward computation.
 
@@ -166,6 +168,12 @@ def forward(
             from a neighbor sampler, the value should be set to the corresponding
             :attr:`fanout`. This option is used to invoke the MFG-variant of
             cugraph-ops kernel.
+        deterministic_dgrad : bool, default=False
+            Optional flag indicating whether the feature gradients
+            are computed deterministically using a dedicated workspace buffer.
+        deterministic_wgrad: bool, default=False
+            Optional flag indicating whether the weight gradients
+            are computed deterministically using a dedicated workspace buffer.
 
         Returns
         -------
@@ -196,6 +204,8 @@ def forward(
         _graph = self.get_cugraph_ops_CSC(
             g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree
         )
+        if deterministic_dgrad:
+            _graph.add_reverse_graph()
 
         if nfeat_bipartite:
             nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1]))
@@ -228,6 +238,8 @@ def forward(
             negative_slope=self.negative_slope,
             concat_heads=self.concat,
             edge_feat=efeat,
+            deterministic_dgrad=deterministic_dgrad,
+            deterministic_wgrad=deterministic_wgrad,
         )[: g.num_dst_nodes()]
 
         if self.concat:
diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index c6f76325761..f17292c5e70 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
 dependencies = [
     "cugraph==24.4.*",
     "numba>=0.57",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pylibcugraphops==24.4.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
index 05d540b7c45..df16fc9fd6c 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -1083,13 +1083,12 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType:
 
         idx = attr.index
         if idx is not None:
-            if feature_backend == "torch":
+            if feature_backend in ["torch", "wholegraph"]:
                 if not isinstance(idx, torch.Tensor):
                     raise TypeError(
                         f"Type {type(idx)} invalid"
                         f" for feature store backend {feature_backend}"
                     )
-                idx = idx.cpu()
             elif feature_backend == "numpy":
                 # allow feature indexing through cupy arrays
                 if isinstance(idx, cupy.ndarray):
@@ -1244,5 +1243,77 @@ def _infer_unspecified_attr(self, attr: CuGraphTensorAttr) -> CuGraphTensorAttr:
 
         return attr
 
+    def filter(
+        self,
+        format: str,
+        node_dict: Dict[str, torch.Tensor],
+        row_dict: Dict[str, torch.Tensor],
+        col_dict: Dict[str, torch.Tensor],
+        edge_dict: Dict[str, Tuple[torch.Tensor]],
+    ) -> torch_geometric.data.HeteroData:
+        """
+        Parameters
+        ----------
+        format: str
+            "CSC" selects the CSC layout; any other value produces a COO edge index
+        node_dict: Dict[str, torch.Tensor]
+            IDs of nodes in the original store to include in the output
+        row_dict: Dict[str, torch.Tensor]
+            Renumbered output edge index row
+        col_dict: Dict[str, torch.Tensor]
+            Renumbered output edge index column
+        edge_dict: Dict[str, Tuple[torch.Tensor]]
+            Currently unused original edge mapping
+        """
+        data = torch_geometric.data.HeteroData()
+
+        # TODO use torch_geometric.EdgeIndex in release 24.04 (Issue #4051)
+        for attr in self.get_all_edge_attrs():
+            key = attr.edge_type
+            if key in row_dict and key in col_dict:
+                if format == "CSC":
+                    data.put_edge_index(
+                        (row_dict[key], col_dict[key]),
+                        edge_type=key,
+                        layout="csc",
+                        is_sorted=True,
+                    )
+                else:
+                    data[key].edge_index = torch.stack(
+                        [
+                            row_dict[key],
+                            col_dict[key],
+                        ],
+                        dim=0,
+                    )
+
+        required_attrs = []
+        # Cache per-group index tensors to avoid repeated device copies;
+        # node_dict itself is updated in place to hold the GPU copy when one is made
+        node_dict_cpu = {}
+        for attr in self.get_all_tensor_attrs():
+            if attr.group_name in node_dict:
+                device = self.__features.get_storage(attr.group_name, attr.attr_name)
+                attr.index = node_dict[attr.group_name]
+                if not isinstance(attr.index, torch.Tensor):
+                    raise ValueError("Node index must be a tensor!")
+                if attr.index.is_cuda and device == "cpu":
+                    if attr.group_name not in node_dict_cpu:
+                        node_dict_cpu[attr.group_name] = attr.index.cpu()
+                    attr.index = node_dict_cpu[attr.group_name]
+                elif attr.index.is_cpu and device == "cuda":
+                    node_dict_cpu[attr.group_name] = attr.index
+                    node_dict[attr.group_name] = attr.index.cuda()
+                    attr.index = node_dict[attr.group_name]
+
+                required_attrs.append(attr)
+                data[attr.group_name].num_nodes = attr.index.size(0)
+
+        tensors = self.multi_get_tensor(required_attrs)
+        for i, attr in enumerate(required_attrs):
+            data[attr.group_name][attr.attr_name] = tensors[i]
+
+        return data
+
     def __len__(self):
         return len(self.get_all_tensor_attrs())
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
index 9c0adaad879..4ca573504a1 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -21,7 +21,7 @@
 import torch
 import numpy as np
 
-from torch_geometric.nn import CuGraphSAGEConv
+from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
index 82f5e7ea67d..9c96a707e4d 100644
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
+++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,7 +18,7 @@
 
 import torch
 
-from torch_geometric.nn import CuGraphSAGEConv
+from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
index bcfaf579820..55c9e9b3329 100644
--- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -28,7 +28,6 @@
     _sampler_output_from_sampling_results_heterogeneous,
     _sampler_output_from_sampling_results_homogeneous_csr,
     _sampler_output_from_sampling_results_homogeneous_coo,
-    filter_cugraph_store_csc,
 )
 
 from typing import Union, Tuple, Sequence, List, Dict
@@ -454,31 +453,20 @@ def __next__(self):
 
         start_time_feature = perf_counter()
         # Create a PyG HeteroData object, loading the required features
-        if self.__coo:
-            pyg_filter_fn = (
-                torch_geometric.loader.utils.filter_custom_hetero_store
-                if hasattr(torch_geometric.loader.utils, "filter_custom_hetero_store")
-                else torch_geometric.loader.utils.filter_custom_store
-            )
-            out = pyg_filter_fn(
-                self.__feature_store,
-                self.__graph_store,
-                sampler_output.node,
-                sampler_output.row,
-                sampler_output.col,
-                sampler_output.edge,
-            )
-        else:
-            out = filter_cugraph_store_csc(
-                self.__feature_store,
-                self.__graph_store,
-                sampler_output.node,
-                sampler_output.row,
-                sampler_output.col,
-                sampler_output.edge,
-            )
+        if self.__graph_store != self.__feature_store:
+            # TODO Possibly support this if there is an actual use case
+            raise ValueError("Separate graph and feature stores currently unsupported")
+
+        out = self.__graph_store.filter(
+            "COO" if self.__coo else "CSC",
+            sampler_output.node,
+            sampler_output.row,
+            sampler_output.col,
+            sampler_output.edge,
+        )
 
         # Account for CSR format in cuGraph vs. CSC format in PyG
+        # TODO deprecate and remove this functionality
         if self.__coo and self.__graph_store.order == "CSC":
             for edge_type in out.edge_index_dict:
                 out[edge_type].edge_index = out[edge_type].edge_index.flip(dims=[0])
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
index 309bee4e228..d1785f2bef8 100644
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -162,6 +162,10 @@ def forward(
         csc: Tuple[torch.Tensor, torch.Tensor, int],
         edge_attr: Optional[torch.Tensor] = None,
         max_num_neighbors: Optional[int] = None,
+        deterministic_dgrad: bool = False,
+        deterministic_wgrad: bool = False,
+        high_precision_dgrad: bool = False,
+        high_precision_wgrad: bool = False,
     ) -> torch.Tensor:
         r"""Runs the forward pass of the module.
 
@@ -178,11 +182,27 @@ def forward(
                 of a destination node. When enabled, it allows models to use
                 the message-flow-graph primitives in cugraph-ops.
                 (default: :obj:`None`)
+            deterministic_dgrad : bool, default=False
+                Optional flag indicating whether the feature gradients
+                are computed deterministically using a dedicated workspace buffer.
+            deterministic_wgrad: bool, default=False
+                Optional flag indicating whether the weight gradients
+                are computed deterministically using a dedicated workspace buffer.
+            high_precision_dgrad: bool, default=False
+                Optional flag indicating whether gradients for inputs in half precision
+                are kept in single precision as long as possible and only cast to
+                the corresponding input type at the very end.
+            high_precision_wgrad: bool, default=False
+                Optional flag indicating whether gradients for weights in half precision
+                are kept in single precision as long as possible and only cast to
+                the corresponding input type at the very end.
         """
         bipartite = not isinstance(x, torch.Tensor)
         graph = self.get_cugraph(
             csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors
         )
+        if deterministic_dgrad:
+            graph.add_reverse_graph()
 
         if edge_attr is not None:
             if self.lin_edge is None:
@@ -220,6 +240,10 @@ def forward(
             negative_slope=self.negative_slope,
             concat_heads=self.concat,
             edge_feat=edge_attr,
+            deterministic_dgrad=deterministic_dgrad,
+            deterministic_wgrad=deterministic_wgrad,
+            high_precision_dgrad=high_precision_dgrad,
+            high_precision_wgrad=high_precision_wgrad,
         )
 
         if self.bias is not None:
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
index 32956dcb400..33865898816 100644
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -174,6 +174,8 @@ def forward(
         x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
         csc: Tuple[torch.Tensor, torch.Tensor, int],
         edge_attr: Optional[torch.Tensor] = None,
+        deterministic_dgrad: bool = False,
+        deterministic_wgrad: bool = False,
     ) -> torch.Tensor:
         r"""Runs the forward pass of the module.
 
@@ -186,9 +188,17 @@ def forward(
                 :meth:`to_csc` method to convert an :obj:`edge_index`
                 representation to the desired format.
             edge_attr: (torch.Tensor, optional) The edge features.
+            deterministic_dgrad : bool, default=False
+                Optional flag indicating whether the feature gradients
+                are computed deterministically using a dedicated workspace buffer.
+            deterministic_wgrad: bool, default=False
+                Optional flag indicating whether the weight gradients
+                are computed deterministically using a dedicated workspace buffer.
         """
         bipartite = not isinstance(x, torch.Tensor) or not self.share_weights
         graph = self.get_cugraph(csc, bipartite=bipartite)
+        if deterministic_dgrad:
+            graph.add_reverse_graph()
 
         if edge_attr is not None:
             if self.lin_edge is None:
@@ -217,6 +227,8 @@ def forward(
             negative_slope=self.negative_slope,
             concat_heads=self.concat,
             edge_feat=edge_attr,
+            deterministic_dgrad=deterministic_dgrad,
+            deterministic_wgrad=deterministic_wgrad,
         )
 
         if self.bias is not None:
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
index 65cb63d25e0..ffab54efe08 100644
--- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
@@ -411,6 +411,10 @@ def filter_cugraph_store_csc(
     col_dict: Dict[str, torch.Tensor],
     edge_dict: Dict[str, Tuple[torch.Tensor]],
 ) -> torch_geometric.data.HeteroData:
+    """
+    Deprecated
+    """
+
     data = torch_geometric.data.HeteroData()
 
     for attr in graph_store.get_all_edge_attrs():
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index cbee5ed4b58..150ecbf506b 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
 dependencies = [
     "cugraph==24.4.*",
     "numba>=0.57",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pylibcugraphops==24.4.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml
index a32b18a9551..d6cf48432cb 100644
--- a/python/cugraph-service/server/pyproject.toml
+++ b/python/cugraph-service/server/pyproject.toml
@@ -26,7 +26,7 @@ dependencies = [
     "dask-cuda==24.4.*",
     "dask-cudf==24.4.*",
     "numba>=0.57",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "rapids-dask-dependency==24.4.*",
     "rmm==24.4.*",
     "thriftpy2",
@@ -46,7 +46,7 @@ cugraph-service-server = "cugraph_service_server.__main__:main"
 [project.optional-dependencies]
 test = [
     "networkx>=2.5.1",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pandas",
     "pytest",
     "pytest-benchmark",
diff --git a/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py b/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py
index 77a53882fc4..f0186220114 100644
--- a/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py
+++ b/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -168,19 +168,54 @@ def get_data(
             feat, wgth.WholeMemoryEmbedding
         ):
             indices_tensor = (
-                indices
+                indices.cuda()
                 if isinstance(indices, torch.Tensor)
                 else torch.as_tensor(indices, device="cuda")
             )
             return feat.gather(indices_tensor)
-        else:
-            return feat[indices]
+        elif not isinstance(torch, MissingModule) and isinstance(feat, torch.Tensor):
+            if indices is not None:
+                if not isinstance(indices, torch.Tensor):
+                    indices = torch.as_tensor(indices)
+
+                if feat.is_cpu and indices.is_cuda:
+                    # TODO maybe add a warning here
+                    indices = indices.cpu()
+        return feat[indices]
 
     def get_feature_list(self) -> list[str]:
         return {feat_name: feats.keys() for feat_name, feats in self.fd.items()}
 
+    def get_storage(self, type_name: str, feat_name: str) -> str:
+        """
+        Returns where the data is stored ("cuda" or "cpu").
+        Note: CUDA-managed data reports "cuda" even if it is in host memory.
+
+        Parameters
+        ----------
+        type_name : str
+            The node-type/edge-type to retrieve data for
+        feat_name : str
+            The feature name to retrieve data for
+
+        Returns
+        -------
+        "cuda" for data managed by CUDA, otherwise "cpu".
+        """
+        feat = self.fd[feat_name][type_name]
+        if not isinstance(wgth, MissingModule) and isinstance(
+            feat, wgth.WholeMemoryEmbedding
+        ):
+            return "cuda"
+        # torch may be a MissingModule instance, so check before using torch.Tensor
+        elif not isinstance(torch, MissingModule) and isinstance(feat, torch.Tensor):
+            return "cpu" if feat.is_cpu else "cuda"
+        else:
+            return "cpu"
+
     @staticmethod
     def _cast_feat_obj_to_backend(feat_obj, backend: str, **kwargs):
+        # TODO (Issue #4078) support casting WG tensors to numpy and torch
         if backend == "numpy":
             if isinstance(feat_obj, (cudf.DataFrame, pd.DataFrame)):
                 return _cast_to_numpy_ar(feat_obj.values, **kwargs)
@@ -192,6 +227,8 @@ def _cast_feat_obj_to_backend(feat_obj, backend: str, **kwargs):
             else:
                 return _cast_to_torch_tensor(feat_obj, **kwargs)
         elif backend == "wholegraph":
+            if isinstance(feat_obj, wgth.WholeMemoryEmbedding):
+                return feat_obj
             return _get_wg_embedding(feat_obj, **kwargs)
 
 
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index cdf1e937e67..0ef5eaf1b9e 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 import gc
-from typing import Union
+from typing import Union, Iterable
 import warnings
 
 import cudf
@@ -28,10 +28,11 @@
     GraphProperties,
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
+    degrees as pylibcugraph_degrees,
+    in_degrees as pylibcugraph_in_degrees,
+    out_degrees as pylibcugraph_out_degrees,
 )
 
-from cugraph.structure import graph_primtypes_wrapper
-from cugraph.structure.graph_primtypes_wrapper import Direction
 from cugraph.structure.number_map import NumberMap
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.dask.common.part_utils import (
@@ -536,7 +537,158 @@ def number_of_edges(self, directed_edges=False):
                 raise RuntimeError("Graph is Empty")
         return self.properties.edge_count
 
-    def in_degree(self, vertex_subset=None):
+    def degrees_function(
+        self,
+        vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None,
+        degree_type: str = "in_degree",
+    ) -> dask_cudf.DataFrame:
+        """
+        Compute vertex in-degree, out-degree, degree and degrees.
+
+        1) Vertex in-degree is the number of edges pointing into the vertex.
+        2) Vertex out-degree is the number of edges pointing out from the vertex.
+        3) Vertex degree, is the total number of edges incident to a vertex
+            (both in and out edges)
+        4) Vertex degrees computes vertex in-degree and out-degree.
+
+        By default, this method computes the requested quantity for the entire
+        set of vertices. If vertex_subset is provided, only the vertices listed
+        in vertex_subset are reported.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or dask_cudf.Series, iterable container, optional
+            A container of vertices for which to report the requested degree(s).
+            If not set, degrees are computed for the entire set of vertices.
+
+        degree_type : str (default='in_degree')
+            One of 'in_degree', 'out_degree', 'degree' or 'degrees'.
+
+        Returns
+        -------
+        df : dask_cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the in_degree, out_degrees,
+            degree or degrees. The ordering is relative to the adjacency list,
+            or that given by the specified vertex_subset.
+
+        Examples
+        --------
+        >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
+        ...                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.degrees_function([0,9,12], "in_degree")
+
+        """
+        _client = default_client()
+
+        def _call_plc_degrees_function(
+            sID: bytes, mg_graph_x, source_vertices: cudf.Series, degree_type: str
+        ) -> cp.array:
+
+            if degree_type == "in_degree":
+                results = pylibcugraph_in_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            elif degree_type == "out_degree":
+                results = pylibcugraph_out_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            elif degree_type in ["degree", "degrees"]:
+                results = pylibcugraph_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            else:
+                raise ValueError(
+                    "Incorrect degree type passed, valid values are "
+                    "'in_degree', 'out_degree', 'degree' and 'degrees', "
+                    f"got '{degree_type}'"
+                )
+
+            return results
+
+        if isinstance(vertex_subset, int):
+            vertex_subset = [vertex_subset]
+
+        if isinstance(vertex_subset, list):
+            vertex_subset = cudf.Series(vertex_subset)
+
+        if vertex_subset is not None:
+            if self.renumbered:
+                vertex_subset = self.renumber_map.to_internal_vertex_id(vertex_subset)
+                vertex_subset_type = self.edgelist.edgelist_df.dtypes.iloc[0]
+            else:
+                vertex_subset_type = self.input_df.dtypes.iloc[0]
+
+            vertex_subset = vertex_subset.astype(vertex_subset_type)
+
+        cupy_result = [
+            _client.submit(
+                _call_plc_degrees_function,
+                Comms.get_session_id(),
+                self._plc_graph[w],
+                vertex_subset,
+                degree_type,
+                workers=[w],
+                allow_other_workers=False,
+            )
+            for w in Comms.get_workers()
+        ]
+
+        wait(cupy_result)
+
+        def convert_to_cudf(cp_arrays: cp.ndarray, degree_type: str) -> cudf.DataFrame:
+            """
+            Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
+            """
+            df = cudf.DataFrame()
+            df["vertex"] = cp_arrays[0]
+            if degree_type in ["in_degree", "out_degree"]:
+                df["degree"] = cp_arrays[1]
+            # degree_type must be either 'degree' or 'degrees'
+            else:
+                if degree_type == "degrees":
+                    df["in_degree"] = cp_arrays[1]
+                    df["out_degree"] = cp_arrays[2]
+                else:
+                    df["degree"] = cp_arrays[1] + cp_arrays[2]
+            return df
+
+        cudf_result = [
+            _client.submit(
+                convert_to_cudf,
+                cp_arrays,
+                degree_type,
+                workers=_client.who_has(cp_arrays)[cp_arrays.key],
+            )
+            for cp_arrays in cupy_result
+        ]
+
+        wait(cudf_result)
+        ddf = dask_cudf.from_delayed(cudf_result).persist()
+        wait(ddf)
+
+        # Wait until the inactive futures are released
+        wait([(r.release(), c_r.release()) for r, c_r in zip(cupy_result, cudf_result)])
+
+        if self.properties.renumbered:
+            ddf = self.renumber_map.unrenumber(ddf, "vertex")
+
+        return ddf
+
+    def in_degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex in-degree. Vertex in-degree is the number of edges
         pointing into the vertex. By default, this method computes vertex
@@ -572,61 +724,11 @@ def in_degree(self, vertex_subset=None):
         >>> df = G.in_degree([0,9,12])
 
         """
-        src_col_name = self.source_columns
-        dst_col_name = self.destination_columns
-
-        # select only the vertex columns
-        if not isinstance(src_col_name, list) and not isinstance(dst_col_name, list):
-            vertex_col_names = [src_col_name] + [dst_col_name]
-
-        df = self.input_df[vertex_col_names]
-        df = df.drop(columns=src_col_name)
-
-        nodes = self.nodes()
-        if isinstance(nodes, dask_cudf.Series):
-            nodes = nodes.to_frame()
-
-        if not isinstance(dst_col_name, list):
-            df = df.rename(columns={dst_col_name: "vertex"})
-            dst_col_name = "vertex"
-
-        vertex_col_names = df.columns
-        nodes.columns = vertex_col_names
-
-        df["degree"] = 1
-
-        # FIXME: leverage the C++ in_degree for optimal performance
-        in_degree = (
-            df.groupby(dst_col_name)
-            .degree.count(split_out=df.npartitions)
-            .reset_index()
-        )
-
-        # Add vertices with zero in_degree
-        in_degree = nodes.merge(in_degree, how="outer").fillna(0)
-
-        # Convert vertex_subset to dataframe.
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                if isinstance(vertex_subset, dask_cudf.Series):
-                    vertex_subset = vertex_subset.to_frame()
-                else:
-                    df = cudf.DataFrame()
-                    if isinstance(vertex_subset, (cudf.Series, list)):
-                        df["vertex"] = vertex_subset
-                        vertex_subset = df
-            if isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                vertex_subset.columns = vertex_col_names
-                in_degree = in_degree.merge(vertex_subset, how="inner")
-            else:
-                raise TypeError(
-                    f"Expected type are: cudf, dask_cudf objects, "
-                    f"iterable container, got "
-                    f"{type(vertex_subset)}"
-                )
-        return in_degree
+        return self.degrees_function(vertex_subset, "in_degree")
 
-    def out_degree(self, vertex_subset=None):
+    def out_degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex out-degree. Vertex out-degree is the number of edges
         pointing out from the vertex. By default, this method computes vertex
@@ -662,62 +764,11 @@ def out_degree(self, vertex_subset=None):
         >>> df = G.out_degree([0,9,12])
 
         """
-        src_col_name = self.source_columns
-        dst_col_name = self.destination_columns
-
-        # select only the vertex columns
-        if not isinstance(src_col_name, list) and not isinstance(dst_col_name, list):
-            vertex_col_names = [src_col_name] + [dst_col_name]
-
-        df = self.input_df[vertex_col_names]
-        df = df.drop(columns=dst_col_name)
-
-        nodes = self.nodes()
-        if isinstance(nodes, dask_cudf.Series):
-            nodes = nodes.to_frame()
-
-        if not isinstance(src_col_name, list):
-            df = df.rename(columns={src_col_name: "vertex"})
-            src_col_name = "vertex"
-
-        vertex_col_names = df.columns
-
-        nodes.columns = vertex_col_names
-
-        df["degree"] = 1
-        # leverage the C++ out_degree for optimal performance
-        out_degree = (
-            df.groupby(src_col_name)
-            .degree.count(split_out=df.npartitions)
-            .reset_index()
-        )
-
-        # Add vertices with zero out_degree
-        out_degree = nodes.merge(out_degree, how="outer").fillna(0)
-
-        # Convert vertex_subset to dataframe.
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                if isinstance(vertex_subset, dask_cudf.Series):
-                    vertex_subset = vertex_subset.to_frame()
-                else:
-                    df = cudf.DataFrame()
-                    if isinstance(vertex_subset, (cudf.Series, list)):
-                        df["vertex"] = vertex_subset
-                        vertex_subset = df
-            if isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                vertex_subset.columns = vertex_col_names
-                out_degree = out_degree.merge(vertex_subset, how="inner")
-            else:
-                raise TypeError(
-                    f"Expected type are: cudf, dask_cudf objects, "
-                    f"iterable container, got "
-                    f"{type(vertex_subset)}"
-                )
+        return self.degrees_function(vertex_subset, "out_degree")
 
-        return out_degree
-
-    def degree(self, vertex_subset=None):
+    def degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex degree, which is the total number of edges incident
         to a vertex (both in and out edges). By default, this method computes
@@ -754,18 +805,12 @@ def degree(self, vertex_subset=None):
 
         """
 
-        vertex_in_degree = self.in_degree(vertex_subset)
-        vertex_out_degree = self.out_degree(vertex_subset)
-        # FIXME: leverage the C++ degree for optimal performance
-        vertex_degree = dask_cudf.concat([vertex_in_degree, vertex_out_degree])
-        vertex_degree = vertex_degree.groupby(["vertex"], as_index=False).sum(
-            split_out=self.input_df.npartitions
-        )
-
-        return vertex_degree
+        return self.degrees_function(vertex_subset, "degree")
 
     # FIXME:  vertex_subset could be a DataFrame for multi-column vertices
-    def degrees(self, vertex_subset=None):
+    def degrees(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex in-degree and out-degree. By default, this method
         computes vertex degrees for the entire set of vertices. If
@@ -802,21 +847,7 @@ def degrees(self, vertex_subset=None):
         >>> df = G.degrees([0,9,12])
 
         """
-        raise NotImplementedError("Not supported for distributed graph")
-
-    def _degree(self, vertex_subset, direction=Direction.ALL):
-        vertex_col, degree_col = graph_primtypes_wrapper._mg_degree(self, direction)
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["degree"] = degree_col
-
-        if self.renumbered is True:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        if vertex_subset is not None:
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        return df
+        return self.degrees_function(vertex_subset, "degrees")
 
     def get_two_hop_neighbors(self, start_vertices=None):
         """
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
index 121a4c6245a..99934e02b10 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 from cugraph.structure import graph_primtypes_wrapper
-from cugraph.structure.graph_primtypes_wrapper import Direction
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.structure.number_map import NumberMap
 import cugraph.dask.common.mg_utils as mg_utils
@@ -23,10 +22,13 @@
 import numpy as np
 import warnings
 from cugraph.dask.structure import replication
-from typing import Union, Dict
+from typing import Union, Dict, Iterable
 from pylibcugraph import (
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
+    degrees as pylibcugraph_degrees,
+    in_degrees as pylibcugraph_in_degrees,
+    out_degrees as pylibcugraph_out_degrees,
 )
 
 from pylibcugraph import (
@@ -854,7 +856,111 @@ def number_of_edges(self, directed_edges=False):
                 raise ValueError("Graph is Empty")
         return self.properties.edge_count
 
-    def in_degree(self, vertex_subset=None):
+    def degrees_function(
+        self,
+        vertex_subset: Union[cudf.Series, Iterable] = None,
+        degree_type: str = "in_degree",
+    ) -> cudf.DataFrame:
+        """
+        Compute vertex in-degree, out-degree, degree and degrees.
+
+        1) Vertex in-degree is the number of edges pointing into the vertex.
+        2) Vertex out-degree is the number of edges pointing out from the vertex.
+        3) Vertex degree, is the total number of edges incident to a vertex
+            (both in and out edges)
+        4) Vertex degrees computes vertex in-degree and out-degree.
+
+        By default, this method computes the requested quantity for the entire
+        set of vertices. If vertex_subset is provided, only the vertices listed
+        in vertex_subset are reported.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or iterable container, optional
+            A container of vertices for which to report the requested degree(s).
+            If not set, degrees are computed for the entire set of vertices.
+
+        degree_type : str (default='in_degree')
+            One of 'in_degree', 'out_degree', 'degree' or 'degrees'.
+
+        Returns
+        -------
+        df : cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the in_degree, out_degrees,
+            degree or degrees. The ordering is relative to the adjacency list,
+            or that given by the specified vertex_subset.
+
+        Examples
+        --------
+        >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
+        ...                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.degrees_function([0,9,12], "in_degree")
+
+        """
+        if vertex_subset is not None:
+            if not isinstance(vertex_subset, cudf.Series):
+                vertex_subset = cudf.Series(vertex_subset)
+                if self.properties.renumbered is True:
+                    vertex_subset = self.renumber_map.to_internal_vertex_id(
+                        vertex_subset
+                    )
+                    vertex_subset_type = self.edgelist.edgelist_df.dtypes.iloc[0]
+                else:
+                    vertex_subset_type = self.input_df.dtypes.iloc[0]
+
+                vertex_subset = vertex_subset.astype(vertex_subset_type)
+
+        do_expensive_check = False
+        df = cudf.DataFrame()
+        vertex = None
+
+        if degree_type == "in_degree":
+            vertex, in_degrees = pylibcugraph_in_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            df["degree"] = in_degrees
+        elif degree_type == "out_degree":
+            vertex, out_degrees = pylibcugraph_out_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            df["degree"] = out_degrees
+        elif degree_type in ["degree", "degrees"]:
+            vertex, in_degrees, out_degrees = pylibcugraph_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            if degree_type == "degrees":
+                df["in_degree"] = in_degrees
+                df["out_degree"] = out_degrees
+
+            else:
+                df["degree"] = in_degrees + out_degrees
+        else:
+            raise ValueError(
+                "Incorrect degree type passed, valid values are "
+                "'in_degree', 'out_degree', 'degree' and 'degrees', "
+                f"got '{degree_type}'"
+            )
+        df["vertex"] = vertex
+        if self.properties.renumbered is True:
+            df = self.renumber_map.unrenumber(df, "vertex")
+
+        return df
+
+    def in_degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex in-degree. Vertex in-degree is the number of edges
         pointing into the vertex. By default, this method computes vertex
@@ -892,11 +998,11 @@ def in_degree(self, vertex_subset=None):
         >>> df = G.in_degree([0,9,12])
 
         """
-        in_degree = self._degree(vertex_subset, direction=Direction.IN)
-
-        return in_degree
+        return self.degrees_function(vertex_subset, "in_degree")
 
-    def out_degree(self, vertex_subset=None):
+    def out_degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex out-degree. Vertex out-degree is the number of edges
         pointing out from the vertex. By default, this method computes vertex
@@ -934,10 +1040,11 @@ def out_degree(self, vertex_subset=None):
         >>> df = G.out_degree([0,9,12])
 
         """
-        out_degree = self._degree(vertex_subset, direction=Direction.OUT)
-        return out_degree
+        return self.degrees_function(vertex_subset, "out_degree")
 
-    def degree(self, vertex_subset=None):
+    def degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex degree, which is the total number of edges incident
         to a vertex (both in and out edges). By default, this method computes
@@ -976,10 +1083,12 @@ def degree(self, vertex_subset=None):
         >>> subset_df = G.degree([0,9,12])
 
         """
-        return self._degree(vertex_subset)
+        return self.degrees_function(vertex_subset, "degree")
 
     # FIXME:  vertex_subset could be a DataFrame for multi-column vertices
-    def degrees(self, vertex_subset=None):
+    def degrees(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex in-degree and out-degree. By default, this method
         computes vertex degrees for the entire set of vertices. If
@@ -1019,70 +1128,7 @@ def degrees(self, vertex_subset=None):
         >>> df = G.degrees([0,9,12])
 
         """
-        (
-            vertex_col,
-            in_degree_col,
-            out_degree_col,
-        ) = graph_primtypes_wrapper._degrees(self)
-
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["in_degree"] = in_degree_col
-        df["out_degree"] = out_degree_col
-
-        if self.properties.renumbered:
-            # Get the internal vertex IDs
-            nodes = self.renumber_map.df_internal_to_external["id"]
-        else:
-            nodes = self.nodes()
-        # If the vertex IDs are not contiguous, remove results for the
-        # isolated vertices
-        df = df[df["vertex"].isin(nodes.to_cupy())]
-
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, cudf.Series):
-                vertex_subset = cudf.Series(vertex_subset)
-                if self.properties.renumbered:
-                    vertex_subset = self.renumber_map.to_internal_vertex_id(
-                        vertex_subset
-                    )
-                vertex_subset = vertex_subset.to_cupy()
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        if self.properties.renumbered:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        return df
-
-    def _degree(self, vertex_subset, direction=Direction.ALL):
-        vertex_col, degree_col = graph_primtypes_wrapper._degree(self, direction)
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["degree"] = degree_col
-
-        if self.properties.renumbered:
-            # Get the internal vertex IDs
-            nodes = self.renumber_map.df_internal_to_external["id"]
-        else:
-            nodes = self.nodes()
-        # If the vertex IDs are not contiguous, remove results for the
-        # isolated vertices
-        df = df[df["vertex"].isin(nodes.to_cupy())]
-
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, cudf.Series):
-                vertex_subset = cudf.Series(vertex_subset)
-                if self.properties.renumbered:
-                    vertex_subset = self.renumber_map.to_internal_vertex_id(
-                        vertex_subset
-                    )
-                vertex_subset = vertex_subset.to_cupy()
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        if self.properties.renumbered:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        return df
+        return self.degrees_function(vertex_subset, "degrees")
 
     def _make_plc_graph(
         self,
diff --git a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
index a46f4b9463b..1bef1e0872b 100644
--- a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
+++ b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,7 +18,6 @@
 import cudf
 import dask_cudf
 import cugraph
-from cugraph.dask.common.mg_utils import is_single_gpu
 from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH
 from cudf.testing import assert_series_equal
 
@@ -41,7 +40,6 @@ def setup_function():
 
 
 @pytest.mark.mg
-@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system")
 @pytest.mark.parametrize("directed", IS_DIRECTED)
 @pytest.mark.parametrize("data_file", DATA_PATH)
 def test_dask_mg_degree(dask_client, directed, data_file):
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index 113c316ccbf..a6d3d841298 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
     "dask-cudf==24.4.*",
     "fsspec[http]>=0.6.0",
     "numba>=0.57",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pylibcugraph==24.4.*",
     "raft-dask==24.4.*",
     "rapids-dask-dependency==24.4.*",
@@ -53,7 +53,7 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "networkx>=2.5.1",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pandas",
     "pytest",
     "pytest-benchmark",
diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md
index 8201dc34eb2..1bf310c8c88 100644
--- a/python/nx-cugraph/README.md
+++ b/python/nx-cugraph/README.md
@@ -95,8 +95,6 @@ Below is the list of algorithms that are currently supported in nx-cugraph.
 
 <pre>
 <a href="https://networkx.org/documentation/stable/reference/algorithms/bipartite.html#module-networkx.algorithms.bipartite">bipartite</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/bipartite.html#module-networkx.algorithms.bipartite.basic">basic</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.basic.is_bipartite.html#networkx.algorithms.bipartite.basic.is_bipartite">is_bipartite</a>
  └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/bipartite.html#module-networkx.algorithms.bipartite.generators">generators</a>
      └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.generators.complete_bipartite_graph.html#networkx.algorithms.bipartite.generators.complete_bipartite_graph">complete_bipartite_graph</a>
 <a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#module-networkx.algorithms.centrality">centrality</a>
@@ -152,9 +150,26 @@ Below is the list of algorithms that are currently supported in nx-cugraph.
  ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.reciprocity.overall_reciprocity.html#networkx.algorithms.reciprocity.overall_reciprocity">overall_reciprocity</a>
  └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.reciprocity.reciprocity.html#networkx.algorithms.reciprocity.reciprocity">reciprocity</a>
 <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html">shortest_paths</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.unweighted">unweighted</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length">single_source_shortest_path_length</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length">single_target_shortest_path_length</a>
+ ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.generic">generic</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.has_path.html#networkx.algorithms.shortest_paths.generic.has_path">has_path</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path.html#networkx.algorithms.shortest_paths.generic.shortest_path">shortest_path</a>
+ │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path_length.html#networkx.algorithms.shortest_paths.generic.shortest_path_length">shortest_path_length</a>
+ ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.unweighted">unweighted</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path">all_pairs_shortest_path</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length">all_pairs_shortest_path_length</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.bidirectional_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.bidirectional_shortest_path">bidirectional_shortest_path</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path">single_source_shortest_path</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length">single_source_shortest_path_length</a>
+ │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path">single_target_shortest_path</a>
+ │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length">single_target_shortest_path_length</a>
+ └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.weighted">weighted</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path">all_pairs_bellman_ford_path</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path_length">all_pairs_bellman_ford_path_length</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.bellman_ford_path">bellman_ford_path</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.bellman_ford_path_length">bellman_ford_path_length</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford">single_source_bellman_ford</a>
+     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path">single_source_bellman_ford_path</a>
+     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path_length">single_source_bellman_ford_path_length</a>
 <a href="https://networkx.org/documentation/stable/reference/algorithms/traversal.html">traversal</a>
  └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/traversal.html#module-networkx.algorithms.traversal.breadth_first_search">breadth_first_search</a>
      ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_edges.html#networkx.algorithms.traversal.breadth_first_search.bfs_edges">bfs_edges</a>
diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
index b2f13d25ff3..bc7f63fcd49 100644
--- a/python/nx-cugraph/_nx_cugraph/__init__.py
+++ b/python/nx-cugraph/_nx_cugraph/__init__.py
@@ -33,15 +33,22 @@
     # "description": "TODO",
     "functions": {
         # BEGIN: functions
+        "all_pairs_bellman_ford_path",
+        "all_pairs_bellman_ford_path_length",
+        "all_pairs_shortest_path",
+        "all_pairs_shortest_path_length",
         "ancestors",
         "average_clustering",
         "barbell_graph",
+        "bellman_ford_path",
+        "bellman_ford_path_length",
         "betweenness_centrality",
         "bfs_edges",
         "bfs_layers",
         "bfs_predecessors",
         "bfs_successors",
         "bfs_tree",
+        "bidirectional_shortest_path",
         "bull_graph",
         "caveman_graph",
         "chvatal_graph",
@@ -70,6 +77,7 @@
         "from_scipy_sparse_array",
         "frucht_graph",
         "generic_bfs_edges",
+        "has_path",
         "heawood_graph",
         "hits",
         "house_graph",
@@ -77,7 +85,6 @@
         "icosahedral_graph",
         "in_degree_centrality",
         "is_arborescence",
-        "is_bipartite",
         "is_branching",
         "is_connected",
         "is_forest",
@@ -110,7 +117,14 @@
         "reciprocity",
         "reverse",
         "sedgewick_maze_graph",
+        "shortest_path",
+        "shortest_path_length",
+        "single_source_bellman_ford",
+        "single_source_bellman_ford_path",
+        "single_source_bellman_ford_path_length",
+        "single_source_shortest_path",
         "single_source_shortest_path_length",
+        "single_target_shortest_path",
         "single_target_shortest_path_length",
         "star_graph",
         "tadpole_graph",
@@ -128,7 +142,11 @@
     },
     "additional_docs": {
         # BEGIN: additional_docs
+        "all_pairs_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
+        "all_pairs_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
         "average_clustering": "Directed graphs and `weight` parameter are not yet supported.",
+        "bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
+        "bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
         "betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.",
         "bfs_edges": "`sort_neighbors` parameter is not yet supported.",
         "bfs_predecessors": "`sort_neighbors` parameter is not yet supported.",
@@ -147,11 +165,28 @@
         "katz_centrality": "`nstart` isn't used (but is checked), and `normalized=False` is not supported.",
         "louvain_communities": "`seed` parameter is currently ignored, and self-loops are not yet supported.",
         "pagerank": "`dangling` parameter is not supported, but it is checked for validity.",
+        "shortest_path": "Negative weights are not yet supported, and method is ununsed.",
+        "shortest_path_length": "Negative weights are not yet supported, and method is ununsed.",
+        "single_source_bellman_ford": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
+        "single_source_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
+        "single_source_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
         "transitivity": "Directed graphs are not yet supported.",
         # END: additional_docs
     },
     "additional_parameters": {
         # BEGIN: additional_parameters
+        "all_pairs_bellman_ford_path": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "all_pairs_bellman_ford_path_length": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "bellman_ford_path": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "bellman_ford_path_length": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
         "eigenvector_centrality": {
             "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
         },
@@ -169,6 +204,21 @@
         "pagerank": {
             "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
         },
+        "shortest_path": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "shortest_path_length": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "single_source_bellman_ford": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "single_source_bellman_ford_path": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
+        "single_source_bellman_ford_path_length": {
+            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
+        },
         # END: additional_parameters
     },
 }
diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml
index fdd24861da7..3239fa151d9 100644
--- a/python/nx-cugraph/lint.yaml
+++ b/python/nx-cugraph/lint.yaml
@@ -50,7 +50,7 @@ repos:
       - id: black
       # - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.2.2
+    rev: v0.3.2
     hooks:
       - id: ruff
         args: [--fix-only, --show-fixes]  # --unsafe-fixes]
@@ -77,7 +77,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(nx_cugraph|docs)/
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.2.2
+    rev: v0.3.2
     hooks:
       - id: ruff
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
index 7aafa85f5b7..b4a10bcf0a1 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
@@ -22,7 +22,7 @@
     traversal,
     tree,
 )
-from .bipartite import complete_bipartite_graph, is_bipartite
+from .bipartite import complete_bipartite_graph
 from .centrality import *
 from .cluster import *
 from .components import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py
index e028299c675..bfc7f1d4d42 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py
@@ -10,5 +10,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .basic import *
 from .generators import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py
index 65a8633667a..c32b6fbb708 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py
@@ -36,17 +36,12 @@ def eigenvector_centrality(
     G, max_iter=100, tol=1.0e-6, nstart=None, weight=None, *, dtype=None
 ):
     """`nstart` parameter is not used, but it is checked for validity."""
-    G = _to_graph(G, weight, np.float32)
+    G = _to_graph(G, weight, 1, np.float32)
     if len(G) == 0:
         raise nx.NetworkXPointlessConcept(
             "cannot compute centrality for the null graph"
         )
-    if dtype is not None:
-        dtype = _get_float_dtype(dtype)
-    elif weight in G.edge_values:
-        dtype = _get_float_dtype(G.edge_values[weight].dtype)
-    else:
-        dtype = np.float32
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
     if nstart is not None:
         # Check if given nstart is valid even though we don't use it
         nstart = G._dict_to_nodearray(nstart, dtype=dtype)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py
index 4a0684f72ee..1c6ed61703d 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py
@@ -49,15 +49,10 @@ def katz_centrality(
         # Redundant with the `_can_run` check below when being dispatched by NetworkX,
         # but we raise here in case this funcion is called directly.
         raise NotImplementedError("normalized=False is not supported.")
-    G = _to_graph(G, weight, np.float32)
+    G = _to_graph(G, weight, 1, np.float32)
     if (N := len(G)) == 0:
         return {}
-    if dtype is not None:
-        dtype = _get_float_dtype(dtype)
-    elif weight in G.edge_values:
-        dtype = _get_float_dtype(G.edge_values[weight].dtype)
-    else:
-        dtype = np.float32
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
     if nstart is not None:
         # Check if given nstart is valid even though we don't use it
         nstart = G._dict_to_nodearray(nstart, 0, dtype)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py
index e61a931c069..e529b83ab1a 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py
@@ -46,15 +46,10 @@ def hits(
     weight="weight",
     dtype=None,
 ):
-    G = _to_graph(G, weight, np.float32)
+    G = _to_graph(G, weight, 1, np.float32)
     if (N := len(G)) == 0:
         return {}, {}
-    if dtype is not None:
-        dtype = _get_float_dtype(dtype)
-    elif weight in G.edge_values:
-        dtype = _get_float_dtype(G.edge_values[weight].dtype)
-    else:
-        dtype = np.float32
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
     if nstart is not None:
         nstart = G._dict_to_nodearray(nstart, 0, dtype)
     if max_iter <= 0:
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py
index 40224e91d57..41203a2bc22 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py
@@ -48,12 +48,7 @@ def pagerank(
     G = _to_graph(G, weight, 1, np.float32)
     if (N := len(G)) == 0:
         return {}
-    if dtype is not None:
-        dtype = _get_float_dtype(dtype)
-    elif weight in G.edge_values:
-        dtype = _get_float_dtype(G.edge_values[weight].dtype)
-    else:
-        dtype = np.float32
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
     if nstart is not None:
         nstart = G._dict_to_nodearray(nstart, 0, dtype=dtype)
         if (total := nstart.sum()) == 0:
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py
index b7d6b742176..9d87389a98e 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,4 +10,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .generic import *
 from .unweighted import *
+from .weighted import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py
new file mode 100644
index 00000000000..68dbbace93d
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+import numpy as np
+
+import nx_cugraph as nxcg
+from nx_cugraph.convert import _to_graph
+from nx_cugraph.utils import _dtype_param, _get_float_dtype, networkx_algorithm
+
+from .unweighted import _bfs
+from .weighted import _sssp
+
+__all__ = [
+    "shortest_path",
+    "shortest_path_length",
+    "has_path",
+]
+
+
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def has_path(G, source, target):
+    # True unless the path search raises NetworkXNoPath. TODO PERF: make faster in core
+    try:
+        nxcg.bidirectional_shortest_path(G, source, target)  # result is discarded
+    except nx.NetworkXNoPath:
+        return False
+    return True
+
+
+@networkx_algorithm(
+    extra_params=_dtype_param, version_added="24.04", _plc={"bfs", "sssp"}
+)
+def shortest_path(
+    G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None
+):
+    """Negative weights are not yet supported, and method is ununsed."""
+    if method not in {"dijkstra", "bellman-ford"}:
+        raise ValueError(f"method not supported: {method}")
+    if weight is None:
+        method = "unweighted"  # no weight given: take the unweighted branches below
+    if source is None:
+        if target is None:
+            # All pairs: neither endpoint was given
+            if method == "unweighted":
+                paths = nxcg.all_pairs_shortest_path(G)
+            else:
+                # method is "dijkstra" or "bellman-ford"; both names share
+                # this one weighted code path.
+                paths = nxcg.all_pairs_bellman_ford_path(G, weight=weight, dtype=dtype)
+            if nx.__version__[:3] <= "3.4":  # NOTE(review): string comparison
+                paths = dict(paths)
+        # To target: only `target` was given
+        elif method == "unweighted":
+            paths = nxcg.single_target_shortest_path(G, target)
+        else:
+            # method is "dijkstra" or "bellman-ford"; both names share
+            # this one weighted code path.
+            # XXX: it seems weird that `reverse_path=True` is necessary here
+            G = _to_graph(G, weight, 1, np.float32)
+            dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+            paths = _sssp(
+                G, target, weight, return_type="path", dtype=dtype, reverse_path=True
+            )
+    elif target is None:
+        # From source: only `source` was given
+        if method == "unweighted":
+            paths = nxcg.single_source_shortest_path(G, source)
+        else:
+            # method is "dijkstra" or "bellman-ford"; both names share
+            # this one weighted code path.
+            paths = nxcg.single_source_bellman_ford_path(
+                G, source, weight=weight, dtype=dtype
+            )
+    # From source to target: both endpoints were given
+    elif method == "unweighted":
+        paths = nxcg.bidirectional_shortest_path(G, source, target)
+    else:
+        # method is "dijkstra" or "bellman-ford"; both names share
+        # this one weighted code path.
+        paths = nxcg.bellman_ford_path(G, source, target, weight, dtype=dtype)
+    return paths
+
+
+@shortest_path._can_run
+def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None):
+    return (  # verdict of the `_can_run` hook registered on shortest_path
+        weight is None  # short-circuits: the two weighted checks are skipped
+        or not callable(weight)  # `and` binds tighter: pairs with the next line
+        and not nx.is_negatively_weighted(G, weight=weight)
+    )
+
+
+@networkx_algorithm(
+    extra_params=_dtype_param, version_added="24.04", _plc={"bfs", "sssp"}
+)
+def shortest_path_length(
+    G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None
+):
+    """Negative weights are not yet supported, and method is ununsed."""
+    if method not in {"dijkstra", "bellman-ford"}:
+        raise ValueError(f"method not supported: {method}")
+    if weight is None:
+        method = "unweighted"  # no weight given: take the unweighted branches below
+    if source is None:
+        if target is None:
+            # All pairs: neither endpoint was given
+            if method == "unweighted":
+                lengths = nxcg.all_pairs_shortest_path_length(G)
+            else:
+                # method is "dijkstra" or "bellman-ford"; both names share
+                # this one weighted code path.
+                lengths = nxcg.all_pairs_bellman_ford_path_length(
+                    G, weight=weight, dtype=dtype
+                )
+        # To target: only `target` was given
+        elif method == "unweighted":
+            lengths = nxcg.single_target_shortest_path_length(G, target)
+            if nx.__version__[:3] <= "3.4":  # NOTE(review): string comparison
+                lengths = dict(lengths)  # callee returned iter(rv.items())
+        else:
+            # Weighted ("dijkstra"/"bellman-ford"). NOTE(review): `target` is passed
+            # as the source argument here; confirm this holds for directed graphs.
+            lengths = nxcg.single_source_bellman_ford_path_length(
+                G, target, weight=weight, dtype=dtype
+            )
+    elif target is None:
+        # From source: only `source` was given
+        if method == "unweighted":
+            lengths = nxcg.single_source_shortest_path_length(G, source)
+        else:
+            # method is "dijkstra" or "bellman-ford"; both names share
+            # this one weighted code path; the result is copied into a dict.
+            lengths = dict(
+                nxcg.single_source_bellman_ford_path_length(
+                    G, source, weight=weight, dtype=dtype
+                )
+            )
+    # From source to target: both endpoints were given
+    elif method == "unweighted":
+        G = _to_graph(G)  # the `None` passed to _bfs below is its cutoff argument
+        lengths = _bfs(G, source, None, "Source", return_type="length", target=target)
+    else:
+        # method is "dijkstra" or "bellman-ford"; both names share
+        # this one weighted code path.
+        lengths = nxcg.bellman_ford_path_length(G, source, target, weight, dtype=dtype)
+    return lengths
+
+
+@shortest_path_length._can_run
+def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None):
+    return (  # verdict of the `_can_run` hook registered on shortest_path_length
+        weight is None  # short-circuits: the two weighted checks are skipped
+        or not callable(weight)  # `and` binds tighter: pairs with the next line
+        and not nx.is_negatively_weighted(G, weight=weight)
+    )
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py
index 2012495953e..714289c5b4b 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py
@@ -10,33 +10,127 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import itertools
+
 import cupy as cp
 import networkx as nx
 import numpy as np
 import pylibcugraph as plc
 
 from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import index_dtype, networkx_algorithm
+from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm
+
+__all__ = [
+    "bidirectional_shortest_path",
+    "single_source_shortest_path",
+    "single_source_shortest_path_length",
+    "single_target_shortest_path",
+    "single_target_shortest_path_length",
+    "all_pairs_shortest_path",
+    "all_pairs_shortest_path_length",
+]
 
-__all__ = ["single_source_shortest_path_length", "single_target_shortest_path_length"]
+concat = itertools.chain.from_iterable
 
 
 @networkx_algorithm(version_added="23.12", _plc="bfs")
 def single_source_shortest_path_length(G, source, cutoff=None):
-    return _single_shortest_path_length(G, source, cutoff, "Source")
+    G = _to_graph(G)
+    return _bfs(G, source, cutoff, "Source", return_type="length")
 
 
 @networkx_algorithm(version_added="23.12", _plc="bfs")
 def single_target_shortest_path_length(G, target, cutoff=None):
-    return _single_shortest_path_length(G, target, cutoff, "Target")
+    G = _to_graph(G)
+    rv = _bfs(G, target, cutoff, "Target", return_type="length")
+    if nx.__version__[:3] <= "3.4":
+        return iter(rv.items())
+    return rv
+
+
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def all_pairs_shortest_path_length(G, cutoff=None):
+    # TODO PERF: batched bfs to compute many at once
+    G = _to_graph(G)
+    for n in G:
+        yield (n, _bfs(G, n, cutoff, "Source", return_type="length"))
 
 
-def _single_shortest_path_length(G, source, cutoff, kind):
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def bidirectional_shortest_path(G, source, target):
+    # TODO PERF: do bidirectional traversal in core
     G = _to_graph(G)
+    if source not in G or target not in G:
+        raise nx.NodeNotFound(f"Either source {source} or target {target} is not in G")
+    return _bfs(G, source, None, "Source", return_type="path", target=target)
+
+
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def single_source_shortest_path(G, source, cutoff=None):
+    G = _to_graph(G)
+    return _bfs(G, source, cutoff, "Source", return_type="path")
+
+
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def single_target_shortest_path(G, target, cutoff=None):
+    G = _to_graph(G)
+    return _bfs(G, target, cutoff, "Target", return_type="path", reverse_path=True)
+
+
+@networkx_algorithm(version_added="24.04", _plc="bfs")
+def all_pairs_shortest_path(G, cutoff=None):
+    # TODO PERF: batched bfs to compute many at once
+    G = _to_graph(G)
+    for n in G:
+        yield (n, _bfs(G, n, cutoff, "Source", return_type="path"))
+
+
+def _bfs(
+    G, source, cutoff, kind, *, return_type, reverse_path=False, target=None, scale=None
+):
+    """BFS for unweighted shortest path algorithms.
+
+    Parameters
+    ----------
+    source: node label
+
+    cutoff: int, optional
+
+    kind: {"Source", "Target"}
+
+    return_type: {"length", "path", "length-path"}
+
+    reverse_path: bool
+
+    target: node label
+
+    scale: int or float, optional
+        The amount to scale the lengths
+    """
+    # DRY: _sssp in weighted.py has similar code
     if source not in G:
-        raise nx.NodeNotFound(f"{kind} {source} is not in G")
-    if G.src_indices.size == 0:
-        return {source: 0}
+        # Different message to pass networkx tests
+        if return_type == "length":
+            raise nx.NodeNotFound(f"{kind} {source} is not in G")
+        raise nx.NodeNotFound(f"{kind} {source} not in G")
+    if target is not None:
+        if source == target or cutoff is not None and cutoff <= 0:
+            if return_type == "path":
+                return [source]
+            if return_type == "length":
+                return 0
+            # return_type == "length-path"
+            return 0, [source]
+        if target not in G or G.src_indices.size == 0:
+            raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+    elif G.src_indices.size == 0 or cutoff is not None and cutoff <= 0:
+        if return_type == "path":
+            return {source: [source]}
+        if return_type == "length":
+            return {source: 0}
+        # return_type == "length-path"
+        return {source: 0}, {source: [source]}
+
     if cutoff is None:
         cutoff = -1
     src_index = source if G.key_to_id is None else G.key_to_id[source]
@@ -46,8 +140,68 @@ def _single_shortest_path_length(G, source, cutoff, kind):
         sources=cp.array([src_index], index_dtype),
         direction_optimizing=False,  # True for undirected only; what's recommended?
         depth_limit=cutoff,
-        compute_predecessors=False,
+        compute_predecessors=return_type != "length",
         do_expensive_check=False,
     )
     mask = distances != np.iinfo(distances.dtype).max
-    return G._nodearrays_to_dict(node_ids[mask], distances[mask])
+    node_ids = node_ids[mask]
+    if return_type != "path":
+        lengths = distances = distances[mask]
+        if scale is not None:
+            lengths = scale * lengths
+        lengths = G._nodearrays_to_dict(node_ids, lengths)
+        if target is not None:
+            if target not in lengths:
+                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+            lengths = lengths[target]
+    if return_type != "length":
+        if target is not None:
+            d = dict(zip(node_ids.tolist(), predecessors[mask].tolist()))
+            dst_index = target if G.key_to_id is None else G.key_to_id[target]
+            if dst_index not in d:
+                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+            cur = dst_index
+            paths = [dst_index]
+            while cur != src_index:
+                cur = d[cur]
+                paths.append(cur)
+            if (id_to_key := G.id_to_key) is not None:
+                if reverse_path:
+                    paths = [id_to_key[cur] for cur in paths]
+                else:
+                    paths = [id_to_key[cur] for cur in reversed(paths)]
+            elif not reverse_path:
+                paths.reverse()
+        else:
+            if return_type == "path":
+                distances = distances[mask]
+            groups = _groupby(distances, [predecessors[mask], node_ids])
+
+            # `pred_node_iter` does the equivalent as these nested for loops:
+            # for length in range(1, len(groups)):
+            #     preds, nodes = groups[length]
+            #     for pred, node in zip(preds.tolist(), nodes.tolist()):
+            if G.key_to_id is None:
+                pred_node_iter = concat(
+                    zip(*(x.tolist() for x in groups[length]))
+                    for length in range(1, len(groups))
+                )
+            else:
+                pred_node_iter = concat(
+                    zip(*(G._nodeiter_to_iter(x.tolist()) for x in groups[length]))
+                    for length in range(1, len(groups))
+                )
+            # Consider making utility functions for creating paths
+            paths = {source: [source]}
+            if reverse_path:
+                for pred, node in pred_node_iter:
+                    paths[node] = [node, *paths[pred]]
+            else:
+                for pred, node in pred_node_iter:
+                    paths[node] = [*paths[pred], node]
+    if return_type == "path":
+        return paths
+    if return_type == "length":
+        return lengths
+    # return_type == "length-path"
+    return lengths, paths
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py
new file mode 100644
index 00000000000..32323dd45f3
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+import numpy as np
+import pylibcugraph as plc
+
+from nx_cugraph.convert import _to_graph
+from nx_cugraph.utils import (
+    _dtype_param,
+    _get_float_dtype,
+    _groupby,
+    networkx_algorithm,
+)
+
+from .unweighted import _bfs
+
+__all__ = [
+    "bellman_ford_path",
+    "bellman_ford_path_length",
+    "single_source_bellman_ford",
+    "single_source_bellman_ford_path",
+    "single_source_bellman_ford_path_length",
+    "all_pairs_bellman_ford_path",
+    "all_pairs_bellman_ford_path_length",
+]
+
+
+def _add_doc(func):
+    func.__doc__ = (
+        "Negative cycles are not yet supported. ``NotImplementedError`` will be raised "
+        "if there are negative edge weights. We plan to support negative edge weights "
+        "soon. Also, callable ``weight`` argument is not supported."
+    )
+    return func
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def bellman_ford_path(G, source, target, weight="weight", *, dtype=None):
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    return _sssp(G, source, weight, target, return_type="path", dtype=dtype)
+
+
+@bellman_ford_path._can_run
+def _(G, source, target, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None):
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    return _sssp(G, source, weight, target, return_type="length", dtype=dtype)
+
+
+@bellman_ford_path_length._can_run
+def _(G, source, target, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def single_source_bellman_ford_path(G, source, weight="weight", *, dtype=None):
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    return _sssp(G, source, weight, return_type="path", dtype=dtype)
+
+
+@single_source_bellman_ford_path._can_run
+def _(G, source, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def single_source_bellman_ford_path_length(G, source, weight="weight", *, dtype=None):
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    return _sssp(G, source, weight, return_type="length", dtype=dtype)
+
+
+@single_source_bellman_ford_path_length._can_run
+def _(G, source, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def single_source_bellman_ford(G, source, target=None, weight="weight", *, dtype=None):
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    return _sssp(G, source, weight, target, return_type="length-path", dtype=dtype)
+
+
+@single_source_bellman_ford._can_run
+def _(G, source, target=None, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None):
+    # TODO PERF: batched sssp to compute many at once
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    for n in G:
+        yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype))
+
+
+@all_pairs_bellman_ford_path_length._can_run
+def _(G, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
+@_add_doc
+def all_pairs_bellman_ford_path(G, weight="weight", *, dtype=None):
+    # TODO PERF: batched sssp to compute many at once
+    G = _to_graph(G, weight, 1, np.float32)
+    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
+    for n in G:
+        yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype))
+
+
+@all_pairs_bellman_ford_path._can_run
+def _(G, weight="weight", *, dtype=None):
+    # A callable `weight` is unsupported, as are negative edge weights;
+    # `weight=None` skips the negative-weight scan entirely.
+    if callable(weight):
+        return False
+    return weight is None or not nx.is_negatively_weighted(G, weight=weight)
+
+
+def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=False):
+    """SSSP for weighted shortest paths.
+
+    Falls back to (scaled) ``_bfs`` when ``weight`` is absent or all weights equal.
+
+    Parameters
+    ----------
+    return_type : {"length", "path", "length-path"}
+    reverse_path : bool
+        If True, returned paths list the source node last instead of first.
+    """
+    # DRY: _bfs in unweighted.py has similar code
+    if source not in G:
+        raise nx.NodeNotFound(f"Node {source} not found in graph")
+    if target is not None:
+        if source == target:
+            if return_type == "path":
+                return [source]
+            if return_type == "length":
+                return 0
+            # return_type == "length-path"
+            return 0, [source]
+        if target not in G or G.src_indices.size == 0:
+            raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+    elif G.src_indices.size == 0:
+        if return_type == "path":
+            return {source: [source]}
+        if return_type == "length":
+            return {source: 0}
+        # return_type == "length-path"
+        return {source: 0}, {source: [source]}
+
+    if callable(weight):
+        raise NotImplementedError("callable `weight` argument is not supported")
+
+    # Shared by both BFS fallbacks so they honor `reverse_path` consistently
+    bfs_kwargs = {
+        "return_type": return_type,
+        "target": target,
+        "reverse_path": reverse_path,
+    }
+    if weight not in G.edge_values:
+        # No edge values, so use BFS instead
+        return _bfs(G, source, None, "Source", **bfs_kwargs)
+
+    # Check for negative values since we don't support negative cycles
+    edge_vals = G.edge_values[weight]
+    if weight in G.edge_masks:
+        edge_vals = edge_vals[G.edge_masks[weight]]
+    if (edge_vals < 0).any():
+        raise NotImplementedError("Negative edge weights not yet supported")
+    edge_val = edge_vals[0]
+    if (edge_vals == edge_val).all() and (
+        edge_vals.size == G.src_indices.size or edge_val == 1
+    ):
+        # Edge values are all the same, so use scaled BFS instead
+        return _bfs(G, source, None, "Source", scale=edge_val, **bfs_kwargs)
+
+    src_index = source if G.key_to_id is None else G.key_to_id[source]
+    node_ids, distances, predecessors = plc.sssp(
+        resource_handle=plc.ResourceHandle(),
+        graph=G._get_plc_graph(weight, 1, dtype),
+        source=src_index,
+        cutoff=np.inf,
+        compute_predecessors=True,  # TODO: False is not yet supported
+        # compute_predecessors=return_type != "length",
+        do_expensive_check=False,
+    )
+    mask = distances != np.finfo(distances.dtype).max
+    node_ids = node_ids[mask]
+    if return_type != "path":
+        lengths = G._nodearrays_to_dict(node_ids, distances[mask])
+        if target is not None:
+            if target not in lengths:
+                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+            lengths = lengths[target]
+    if return_type != "length":
+        if target is not None:
+            d = dict(zip(node_ids.tolist(), predecessors[mask].tolist()))
+            dst_index = target if G.key_to_id is None else G.key_to_id[target]
+            if dst_index not in d:
+                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
+            cur = dst_index
+            paths = [dst_index]
+            while cur != src_index:
+                cur = d[cur]
+                paths.append(cur)
+            if (id_to_key := G.id_to_key) is not None:
+                if reverse_path:
+                    paths = [id_to_key[cur] for cur in paths]
+                else:
+                    paths = [id_to_key[cur] for cur in reversed(paths)]
+            elif not reverse_path:
+                paths.reverse()
+        else:
+            groups = _groupby(predecessors[mask], node_ids)
+            if (id_to_key := G.id_to_key) is not None:
+                groups = {id_to_key[k]: v for k, v in groups.items() if k >= 0}
+            paths = {source: [source]}
+            preds = [source]
+            while preds:
+                pred = preds.pop()
+                pred_path = paths[pred]
+                nodes = G._nodearray_to_list(groups[pred])
+                if reverse_path:
+                    for node in nodes:
+                        paths[node] = [node, *pred_path]
+                else:
+                    for node in nodes:
+                        paths[node] = [*pred_path, node]
+                preds.extend(nodes & groups.keys())
+    if return_type == "path":
+        return paths
+    if return_type == "length":
+        return lengths
+    # return_type == "length-path"
+    return lengths, paths
diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py
index d044ba6960d..0d893ac286b 100644
--- a/python/nx-cugraph/nx_cugraph/interface.py
+++ b/python/nx-cugraph/nx_cugraph/interface.py
@@ -67,6 +67,7 @@ def key(testpath):
         no_multigraph = "multigraphs not currently supported"
         louvain_different = "Louvain may be different due to RNG"
         no_string_dtype = "string edge values not currently supported"
+        sssp_path_different = "sssp may choose a different valid path"
 
         xfail = {
             # This is removed while strongly_connected_components() is not
@@ -77,6 +78,19 @@ def key(testpath):
             #     "test_strongly_connected.py:"
             #     "TestStronglyConnected.test_condensation_mapping_and_members"
             # ): "Strongly connected groups in different iteration order",
+            key(
+                "test_cycles.py:TestMinimumCycleBasis.test_unweighted_diamond"
+            ): sssp_path_different,
+            key(
+                "test_cycles.py:TestMinimumCycleBasis.test_weighted_diamond"
+            ): sssp_path_different,
+            key(
+                "test_cycles.py:TestMinimumCycleBasis.test_petersen_graph"
+            ): sssp_path_different,
+            key(
+                "test_cycles.py:TestMinimumCycleBasis."
+                "test_gh6787_and_edge_attribute_names"
+            ): sssp_path_different,
         }
 
         from packaging.version import parse
diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py
index aa06d7fd29b..eab4b42c2cc 100644
--- a/python/nx-cugraph/nx_cugraph/utils/misc.py
+++ b/python/nx-cugraph/nx_cugraph/utils/misc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -22,7 +22,9 @@
 import numpy as np
 
 if TYPE_CHECKING:
-    from ..typing import Dtype
+    import nx_cugraph as nxcg
+
+    from ..typing import Dtype, EdgeKey
 
 try:
     from itertools import pairwise  # Python >=3.10
@@ -190,10 +192,14 @@ def _get_int_dtype(
         raise ValueError("Value is too large to store as integer: {val}") from exc
 
 
-def _get_float_dtype(dtype: Dtype):
+def _get_float_dtype(
+    dtype: Dtype, *, graph: nxcg.Graph | None = None, weight: EdgeKey | None = None
+):
     """Promote dtype to float32 or float64 as appropriate."""
     if dtype is None:
-        return np.dtype(np.float32)
+        if graph is None or weight not in graph.edge_values:
+            return np.dtype(np.float32)
+        dtype = graph.edge_values[weight].dtype
     rv = np.promote_types(dtype, np.float32)
     if np.float32 != rv != np.float64:
         raise TypeError(
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index 07ec0eab264..dbdc8dd19e1 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -33,7 +33,7 @@ classifiers = [
 dependencies = [
     "cupy-cuda11x>=12.0.0",
     "networkx>=3.0",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pylibcugraph==24.4.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/nx-cugraph/scripts/update_readme.py b/python/nx-cugraph/scripts/update_readme.py
old mode 100644
new mode 100755
diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
index c2e22fc1ff7..7cc90145949 100644
--- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -57,6 +57,7 @@ set(cython_sources
     utils.pyx
     weakly_connected_components.pyx
     replicate_edgelist.pyx
+    degrees.pyx
 )
 set(linked_libraries cugraph::cugraph;cugraph::cugraph_c)
 
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py
index ab518e24cae..dcdef05e106 100644
--- a/python/pylibcugraph/pylibcugraph/__init__.py
+++ b/python/pylibcugraph/pylibcugraph/__init__.py
@@ -95,6 +95,8 @@
 
 from pylibcugraph.sorensen_coefficients import sorensen_coefficients
 
+from pylibcugraph.degrees import in_degrees, out_degrees, degrees
+
 
 from pylibcugraph import exceptions
 
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
index 90bc041e5f0..6f1ac1f640b 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
@@ -182,3 +182,58 @@ cdef extern from "cugraph_c/graph_functions.h":
             cugraph_induced_subgraph_result_t** result,
             cugraph_error_t** error
         )
+
+    ###########################################################################
+    # degrees
+    ctypedef struct cugraph_degrees_result_t:
+        pass
+
+    cdef cugraph_error_code_t \
+        cugraph_in_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_out_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_vertices(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_in_degrees(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_out_degrees(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef void \
+        cugraph_degrees_result_free(
+            cugraph_degrees_result_t* degrees_result
+        )
diff --git a/python/pylibcugraph/pylibcugraph/degrees.pyx b/python/pylibcugraph/pylibcugraph/degrees.pyx
new file mode 100644
index 00000000000..7818da441bd
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/degrees.pyx
@@ -0,0 +1,307 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    data_type_id_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_degrees_result_t,
+    cugraph_degrees,
+    cugraph_in_degrees,
+    cugraph_out_degrees,
+    cugraph_degrees_result_get_vertices,
+    cugraph_degrees_result_get_in_degrees,
+    cugraph_degrees_result_get_out_degrees,
+    cugraph_degrees_result_free,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    assert_CAI_type,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+)
+
+
+def in_degrees(ResourceHandle resource_handle,
+               _GPUGraph graph,
+               source_vertices,
+               bool_t do_expensive_check):
+    """
+    Compute the in degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array
+        The nodes for which we will compute degrees.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the in degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, in_degrees) = pylibcugraph.in_degrees(
+    ...     resource_handle, G, None, False)
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        source_vertices_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                source_vertices)
+
+    error_code = cugraph_in_degrees(c_resource_handle_ptr,
+                                    c_graph_ptr,
+                                    source_vertices_ptr,
+                                    do_expensive_check,
+                                    &result_ptr,
+                                    &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_in_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* in_degrees_ptr = \
+        cugraph_degrees_result_get_in_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_in_degrees = copy_to_cupy_array(c_resource_handle_ptr, in_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_in_degrees)
+
+def out_degrees(ResourceHandle resource_handle,
+                _GPUGraph graph,
+                source_vertices,
+                bool_t do_expensive_check):
+    """
+    Compute the out degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array or None
+        The nodes for which we will compute degrees. None is also accepted,
+        as in the example below.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the out degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, out_degrees) = pylibcugraph.out_degrees(
+    ...     resource_handle, G, None, False)
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    # NOTE(review): this view is not freed below; confirm whether it should be.
+    cdef cugraph_type_erased_device_array_view_t* source_vertices_ptr = \
+        create_cugraph_type_erased_device_array_view_from_py_obj(
+            source_vertices)
+
+    error_code = cugraph_out_degrees(c_resource_handle_ptr,
+                                     c_graph_ptr,
+                                     source_vertices_ptr,
+                                     do_expensive_check,
+                                     &result_ptr,
+                                     &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_out_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* out_degrees_ptr = \
+        cugraph_degrees_result_get_out_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_out_degrees = copy_to_cupy_array(c_resource_handle_ptr, out_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_out_degrees)
+
+
+def degrees(ResourceHandle resource_handle,
+            _GPUGraph graph,
+            source_vertices,
+            bool_t do_expensive_check):
+    """
+    Compute the degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array or None
+        The nodes for which we will compute degrees. None is also accepted,
+        as in the example below.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the in degrees for the vertices, the third item in the
+    tuple is a device array containing the out degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, in_degrees, out_degrees) = pylibcugraph.degrees(
+    ...     resource_handle, G, None, False)
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    # NOTE(review): this view is not freed below; confirm whether it should be.
+    cdef cugraph_type_erased_device_array_view_t* source_vertices_ptr = \
+        create_cugraph_type_erased_device_array_view_from_py_obj(
+            source_vertices)
+
+    error_code = cugraph_degrees(c_resource_handle_ptr,
+                                 c_graph_ptr,
+                                 source_vertices_ptr,
+                                 do_expensive_check,
+                                 &result_ptr,
+                                 &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* in_degrees_ptr = \
+        cugraph_degrees_result_get_in_degrees(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* out_degrees_ptr = \
+        cugraph_degrees_result_get_out_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_in_degrees = copy_to_cupy_array(c_resource_handle_ptr, in_degrees_ptr)
+    cupy_out_degrees = copy_to_cupy_array(c_resource_handle_ptr, out_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_in_degrees, cupy_out_degrees)
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index eb7323d19e5..d5f568a7a90 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -42,7 +42,7 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "cudf==24.4.*",
-    "numpy>=1.23",
+    "numpy>=1.23,<2.0a0",
     "pandas",
     "pytest",
     "pytest-benchmark",