diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 707c61e4d3e..1976d8ff46f 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -30,7 +30,23 @@ if [[ ${package_name} == "nx-cugraph" ]] || \ [[ ${package_name} == "cugraph-equivariant" ]]; then RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 dist else + case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXCLUDE_ARGS=( + --exclude "libcublas.so.12" + --exclude "libcublasLt.so.12" + --exclude "libcurand.so.10" + --exclude "libcusolver.so.11" + --exclude "libcusparse.so.12" + --exclude "libnvJitLink.so.12" + ) + ;; + 11.*) + EXCLUDE_ARGS=() + ;; + esac + mkdir -p final_dist - python -m auditwheel repair -w final_dist dist/* + python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist fi diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 6f1b23923ff..20d9bf47e3e 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -19,8 +19,16 @@ export PIP_CONSTRAINT="${PWD}/constraints.txt" PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index ee33ab4a82d..fa967b0be29 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -6,7 +6,16 @@ set -euo pipefail PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac + +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 441627cabce..26b710247f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -332,6 +332,12 @@ set(CUGRAPH_SOURCES src/sampling/neighbor_sampling_sg_v32_e64.cpp src/sampling/neighbor_sampling_sg_v32_e32.cpp src/sampling/neighbor_sampling_sg_v64_e64.cpp + src/sampling/negative_sampling_sg_v32_e64.cu + src/sampling/negative_sampling_sg_v32_e32.cu + src/sampling/negative_sampling_sg_v64_e64.cu + src/sampling/negative_sampling_mg_v32_e64.cu + src/sampling/negative_sampling_mg_v32_e32.cu + src/sampling/negative_sampling_mg_v64_e64.cu src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu src/sampling/sampling_post_processing_sg_v64_e64.cu @@ -483,6 +489,7 @@ 
set(CUGRAPH_SOURCES src/centrality/betweenness_centrality_mg_v32_e32.cu src/centrality/betweenness_centrality_mg_v32_e64.cu src/tree/legacy/mst.cu + src/from_cugraph_ops/sampling_index.cu src/components/weakly_connected_components_sg_v64_e64.cu src/components/weakly_connected_components_sg_v32_e32.cu src/components/weakly_connected_components_sg_v32_e64.cu @@ -656,6 +663,7 @@ add_library(cugraph_c src/c_api/louvain.cpp src/c_api/triangle_count.cpp src/c_api/neighbor_sampling.cpp + src/c_api/negative_sampling.cpp src/c_api/labeling_result.cpp src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 8ba39fa2328..faeb7ad8f83 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1684,6 +1684,8 @@ node2vec_random_walks(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API will be deprecated. uniform_neighbor_sample can be used instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1714,6 +1716,8 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API will be deprecated. uniform_neighbor_sample can be used instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index cbb52ef3b1e..a2ff3166fa4 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -636,7 +636,7 @@ class graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity( raft::handle_t const& handle, @@ -945,7 +945,7 @@ class graph_view_t has_edge(raft::handle_t const& handle, raft::device_span edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity(raft::handle_t const& handle, raft::device_span edge_srcs, diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index fec1a07604e..4e5596d06e0 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -743,4 +743,61 @@ lookup_endpoints_from_edge_ids_and_types( raft::device_span edge_ids_to_lookup, raft::device_span edge_types_to_lookup); +/** + * @brief Negative Sampling + * + * This function generates negative samples for graph. 
+ *
+ * Negative sampling is done by generating a random graph according to the specified
+ * parameters and optionally removing samples that represent actual edges in the graph
+ *
+ * Sampling occurs by creating a list of source vertex ids from biased sampling
+ * of the source vertex space, and destination vertex ids from biased sampling of the
+ * destination vertex space, and using this as the putative list of edges. We
+ * can then optionally remove duplicates and remove actual edges in the graph to generate
+ * the final list. If necessary we will repeat the process to end with a resulting
+ * edge list of the appropriate size.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true)
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state RNG state
+ * @param graph_view Graph View object to generate negative samples for
+ * @param src_biases Optional bias for randomly selecting source vertices. If std::nullopt vertices
+ * will be selected uniformly. In multi-GPU environment the biases should be partitioned based
+ * on the vertex partitions.
+ * @param dst_biases Optional bias for randomly selecting destination vertices. If std::nullopt
+ * vertices will be selected uniformly. In multi-GPU environment the biases should be partitioned
+ * based on the vertex partitions.
+ * @param num_samples Number of negative samples to generate
+ * @param remove_duplicates If true, remove duplicate samples
+ * @param remove_existing_edges If true, remove samples that are actually edges in the graph
+ * @param exact_number_of_samples If true, repeat generation until we get the exact number of
+ * negative samples
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ *
+ * @return tuple containing source vertex ids and destination vertex ids for the negative samples
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> negative_sampling(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<raft::device_span<weight_t const>> src_biases,
+  std::optional<raft::device_span<weight_t const>> dst_biases,
+  size_t num_samples,
+  bool remove_duplicates,
+  bool remove_existing_edges,
+  bool exact_number_of_samples,
+  bool do_expensive_check);
+
 } // namespace cugraph
diff --git a/cpp/include/cugraph_c/coo.h b/cpp/include/cugraph_c/coo.h
new file mode 100644
index 00000000000..ef746c6ed6a
--- /dev/null
+++ b/cpp/include/cugraph_c/coo.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
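
For reference, a minimal single-GPU usage sketch of the negative_sampling API added above in cpp/include/cugraph/sampling_functions.hpp. The wrapper function and the explicit template arguments (following the <vertex_t, edge_t, weight_t, store_transposed, multi_gpu> parameter list reconstructed above) are illustrative assumptions, not code from this patch:

    // Illustrative sketch: draw negative edges from an existing single-GPU graph view,
    // sampling sources and destinations uniformly and rejecting edges already in the graph.
    #include <cugraph/graph_view.hpp>
    #include <cugraph/sampling_functions.hpp>

    #include <raft/core/handle.hpp>
    #include <raft/random/rng_state.hpp>

    #include <rmm/device_uvector.hpp>

    #include <cstddef>
    #include <optional>
    #include <tuple>

    std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>> draw_negative_edges(
      raft::handle_t const& handle,
      cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
      size_t num_samples)
    {
      raft::random::RngState rng_state{0};
      return cugraph::negative_sampling<int32_t, int64_t, float, false, false>(
        handle,
        rng_state,
        graph_view,
        std::nullopt,   // src_biases: select sources uniformly
        std::nullopt,   // dst_biases: select destinations uniformly
        num_samples,
        true,           // remove_duplicates
        true,           // remove_existing_edges
        false,          // exact_number_of_samples
        false);         // do_expensive_check
    }
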
+ */ + +#pragma once + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Opaque COO definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_t; + +/** + * @brief Opaque COO list definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_list_t; + +/** + * @brief Get the source vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of source vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); + +/** + * @brief Get the destination vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of destination vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); + +/** + * @brief Get the edge weights + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge weights, NULL if no edge weights in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); + +/** + * @brief Get the edge id + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge id, NULL if no edge ids in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); + +/** + * @brief Get the edge type + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge type, NULL if no edge types in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); + +/** + * @brief Get the number of coo object in the list + * + * @param [in] coo_list Opaque pointer to COO list + * @return number of elements + */ +size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); + +/** + * @brief Get a COO from the list + * + * @param [in] coo_list Opaque pointer to COO list + * @param [in] index Index of desired COO from list + * @return a cugraph_coo_t* object from the list + */ +cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); + +/** + * @brief Free coo object + * + * @param [in] coo Opaque pointer to COO + */ +void cugraph_coo_free(cugraph_coo_t* coo); + +/** + * @brief Free coo list + * + * @param [in] coo_list Opaque pointer to list of COO objects + */ +void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); + +#ifdef __cplusplus +} +#endif diff --git a/cpp/include/cugraph_c/graph_generators.h b/cpp/include/cugraph_c/graph_generators.h index 272131d2aab..553be530e95 100644 --- a/cpp/include/cugraph_c/graph_generators.h +++ b/cpp/include/cugraph_c/graph_generators.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -27,91 +28,6 @@ extern "C" { typedef enum { POWER_LAW = 0, UNIFORM } cugraph_generator_distribution_t; -/** - * @brief Opaque COO definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_t; - -/** - * @brief Opaque COO list definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_list_t; - -/** - * @brief Get the source vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of source vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); - -/** - * @brief Get the destination vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of destination vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); - -/** - * @brief Get the edge weights 
- * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge weights, NULL if no edge weights in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); - -/** - * @brief Get the edge id - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge id, NULL if no edge ids in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); - -/** - * @brief Get the edge type - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge type, NULL if no edge types in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); - -/** - * @brief Get the number of coo object in the list - * - * @param [in] coo_list Opaque pointer to COO list - * @return number of elements - */ -size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); - -/** - * @brief Get a COO from the list - * - * @param [in] coo_list Opaque pointer to COO list - * @param [in] index Index of desired COO from list - * @return a cugraph_coo_t* object from the list - */ -cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); - -/** - * @brief Free coo object - * - * @param [in] coo Opaque pointer to COO - */ -void cugraph_coo_free(cugraph_coo_t* coo); - -/** - * @brief Free coo list - * - * @param [in] coo_list Opaque pointer to list of COO objects - */ -void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); - /** * @brief Generate RMAT edge list * diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 1a3d20b9339..bb26e577915 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -674,6 +675,57 @@ cugraph_error_code_t cugraph_select_random_vertices(const cugraph_resource_handl cugraph_type_erased_device_array_t** vertices, cugraph_error_t** error); +/** + * @ingroup samplingC + * @brief Perform negative sampling + * + * Negative sampling generates a COO structure defining edges according to the specified parameters + * + * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each + * call + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex ids for the source biases. If @p src_bias and + * @p dst_bias are not specified this is ignored. If + * @p vertices is specified then vertices[i] is the vertex + * id of src_biases[i] and dst_biases[i]. If @p vertices + * is not specified then i is the vertex id if src_biases[i] + * and dst_biases[i] + * @param [in] src_biases Bias for selecting source vertices. If NULL, do uniform + * sampling, if provided probability of vertex i will be + * src_bias[i] / (sum of all source biases) + * @param [in] dst_biases Bias for selecting destination vertices. If NULL, do + * uniform sampling, if provided probability of vertex i + * will be dst_bias[i] / (sum of all destination biases) + * @param [in] num_samples Number of negative samples to generate + * @param [in] remove_duplicates If true, remove duplicates from sampled edges + * @param [in] remove_existing_edges If true, remove sampled edges that actually exist in + * the graph + * @param [in] exact_number_of_samples If true, result should contain exactly @p num_samples. 
If + * false the code will generate @p num_samples and then do + * any filtering as specified + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if + * set to true) + * @param [out] result Opaque pointer to generated coo list + * @param [out] error Pointer to an error object storing details of any error. + * Will be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/coo.hpp b/cpp/src/c_api/coo.hpp new file mode 100644 index 00000000000..a83a3af375a --- /dev/null +++ b/cpp/src/c_api/coo.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/array.hpp" + +#include + +namespace cugraph { +namespace c_api { + +struct cugraph_coo_t { + std::unique_ptr src_{}; + std::unique_ptr dst_{}; + std::unique_ptr wgt_{}; + std::unique_ptr id_{}; + std::unique_ptr type_{}; +}; + +struct cugraph_coo_list_t { + std::vector> list_; +}; + +} // namespace c_api +} // namespace cugraph diff --git a/cpp/src/c_api/graph_generators.cpp b/cpp/src/c_api/graph_generators.cpp index ef478e57098..7601f1508f9 100644 --- a/cpp/src/c_api/graph_generators.cpp +++ b/cpp/src/c_api/graph_generators.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "c_api/array.hpp" +#include "c_api/coo.hpp" #include "c_api/error.hpp" #include "c_api/random.hpp" #include "c_api/resource_handle.hpp" @@ -26,24 +27,6 @@ #include -namespace cugraph { -namespace c_api { - -struct cugraph_coo_t { - std::unique_ptr src_{}; - std::unique_ptr dst_{}; - std::unique_ptr wgt_{}; - std::unique_ptr id_{}; - std::unique_ptr type_{}; -}; - -struct cugraph_coo_list_t { - std::vector> list_; -}; - -} // namespace c_api -} // namespace cugraph - namespace { template diff --git a/cpp/src/c_api/negative_sampling.cpp b/cpp/src/c_api/negative_sampling.cpp new file mode 100644 index 00000000000..54f465d67b4 --- /dev/null +++ b/cpp/src/c_api/negative_sampling.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
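
For reference, a sketch of how the cugraph_negative_sampling entry point declared above in cugraph_c/sampling_algorithms.h is expected to be used together with the COO accessors from the new cugraph_c/coo.h. This is not part of the diff: the resource handle and graph are assumed to have been created through the existing C API, the RNG-state helpers are assumed to come from the existing cugraph_c/random.h, and error handling is abbreviated:

    // Illustrative sketch: generate negative samples through the C API and read back the COO.
    #include <cugraph_c/coo.h>
    #include <cugraph_c/random.h>
    #include <cugraph_c/sampling_algorithms.h>

    #include <cstddef>

    int draw_negative_edges_c(const cugraph_resource_handle_t* handle,
                              cugraph_graph_t* graph,
                              size_t num_samples)
    {
      cugraph_rng_state_t* rng_state = nullptr;
      cugraph_coo_t* coo             = nullptr;
      cugraph_error_t* error         = nullptr;

      if (cugraph_rng_state_create(handle, 0 /* seed */, &rng_state, &error) != CUGRAPH_SUCCESS) {
        return -1;
      }

      // NULL biases request uniform sampling of sources and destinations.
      cugraph_error_code_t status = cugraph_negative_sampling(handle,
                                                              rng_state,
                                                              graph,
                                                              nullptr,  // vertices
                                                              nullptr,  // src_biases
                                                              nullptr,  // dst_biases
                                                              num_samples,
                                                              TRUE,     // remove_duplicates
                                                              TRUE,     // remove_existing_edges
                                                              FALSE,    // exact_number_of_samples
                                                              FALSE,    // do_expensive_check
                                                              &coo,
                                                              &error);

      if (status == CUGRAPH_SUCCESS) {
        cugraph_type_erased_device_array_view_t* srcs = cugraph_coo_get_sources(coo);
        cugraph_type_erased_device_array_view_t* dsts = cugraph_coo_get_destinations(coo);
        size_t n_edges = cugraph_type_erased_device_array_view_size(srcs);
        // ... hand srcs/dsts (device views of length n_edges) to downstream code ...
        cugraph_coo_free(coo);
      }
      cugraph_rng_state_free(rng_state);
      return (status == CUGRAPH_SUCCESS) ? 0 : -1;
    }
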
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/abstract_functor.hpp" +#include "c_api/coo.hpp" +#include "c_api/graph.hpp" +#include "c_api/random.hpp" +#include "c_api/resource_handle.hpp" +#include "c_api/utils.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct negative_sampling_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; + cugraph::c_api::cugraph_graph_t* graph_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* vertices_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* src_biases_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* dst_biases_{nullptr}; + size_t num_samples_; + bool remove_duplicates_{false}; + bool remove_existing_edges_{false}; + bool exact_number_of_samples_{false}; + bool do_expensive_check_{false}; + cugraph::c_api::cugraph_coo_t* result_{nullptr}; + + negative_sampling_functor(const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + rng_state_(reinterpret_cast(rng_state)), + graph_(reinterpret_cast(graph)), + vertices_( + reinterpret_cast(vertices)), + src_biases_(reinterpret_cast( + src_biases)), + dst_biases_(reinterpret_cast( + dst_biases)), + num_samples_(num_samples), + remove_duplicates_(remove_duplicates), + remove_existing_edges_(remove_existing_edges), + exact_number_of_samples_(exact_number_of_samples), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + // FIXME: Think about how to handle SG vice MG + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + // negative_sampling expects store_transposed == false + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>(graph_->graph_); + + auto graph_view = graph->view(); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + rmm::device_uvector vertices(0, handle_.get_stream()); + rmm::device_uvector src_biases(0, handle_.get_stream()); + rmm::device_uvector dst_biases(0, handle_.get_stream()); + + if (src_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + src_biases.resize(src_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(src_biases.data(), + src_biases_->as_type(), + src_biases.size(), + handle_.get_stream()); + + src_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + 
handle_, + std::move(vertices), + std::move(src_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + if (dst_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + dst_biases.resize(dst_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(dst_biases.data(), + dst_biases_->as_type(), + dst_biases.size(), + handle_.get_stream()); + + dst_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + handle_, + std::move(vertices), + std::move(dst_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + auto&& [src, dst] = cugraph::negative_sampling( + handle_, + rng_state_->rng_state_, + graph_view, + (src_biases_ != nullptr) ? std::make_optional(raft::device_span{ + src_biases.data(), src_biases.size()}) + : std::nullopt, + (dst_biases_ != nullptr) ? std::make_optional(raft::device_span{ + dst_biases.data(), dst_biases.size()}) + : std::nullopt, + num_samples_, + remove_duplicates_, + remove_existing_edges_, + exact_number_of_samples_, + do_expensive_check_); + + std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); + + cugraph::unrenumber_int_vertices(handle_, + src.data(), + src.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + cugraph::unrenumber_int_vertices(handle_, + dst.data(), + dst.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_coo_t{ + std::make_unique(src, + graph_->vertex_type_), + std::make_unique(dst, + graph_->vertex_type_), + nullptr, + nullptr, + nullptr}; + } + } +}; + +} // namespace + +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error) +{ + negative_sampling_functor functor{handle, + rng_state, + graph, + vertices, + src_biases, + dst_biases, + num_samples, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + do_expensive_check}; + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/community/egonet_impl.cuh b/cpp/src/community/egonet_impl.cuh index 8b942be5b6a..c7945831ba8 100644 --- a/cpp/src/community/egonet_impl.cuh +++ b/cpp/src/community/egonet_impl.cuh @@ -17,8 +17,6 @@ // #define TIMING -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/components/legacy/connectivity.cu b/cpp/src/components/legacy/connectivity.cu index ecaaab173db..4d0198fdff6 100644 --- a/cpp/src/components/legacy/connectivity.cu +++ b/cpp/src/components/legacy/connectivity.cu @@ -15,7 +15,6 @@ */ #include "scc_matrix.cuh" -#include "utilities/graph_utils.cuh" #include "weak_cc.cuh" #include diff --git a/cpp/src/detail/utility_wrappers_32.cu b/cpp/src/detail/utility_wrappers_32.cu index 6ab5ae375ca..72dee4a19a5 100644 
--- a/cpp/src/detail/utility_wrappers_32.cu +++ b/cpp/src/detail/utility_wrappers_32.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include diff --git a/cpp/src/detail/utility_wrappers_64.cu b/cpp/src/detail/utility_wrappers_64.cu index a12bc3e952d..e7254d97c4d 100644 --- a/cpp/src/detail/utility_wrappers_64.cu +++ b/cpp/src/detail/utility_wrappers_64.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include diff --git a/cpp/src/detail/utility_wrappers.cuh b/cpp/src/detail/utility_wrappers_impl.cuh similarity index 100% rename from cpp/src/detail/utility_wrappers.cuh rename to cpp/src/detail/utility_wrappers_impl.cuh diff --git a/cpp/src/from_cugraph_ops/algo_R.cuh b/cpp/src/from_cugraph_ops/algo_R.cuh new file mode 100644 index 00000000000..031a7d2ceb9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/algo_R.cuh @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device.cuh" + +#include + +#include +#include +#include + +#include + +namespace cugraph::ops::graph { + +// single warp-separated field of type IdxT +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +__device__ __forceinline__ void warp_algo_r_index(IdxT* smem, + IdxT pop_size, + IdxT idx_offset, + int sample_size, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + // first 'sample_size' are just copied + CUGRAPH_OPS_UNROLL + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + smem[i] = idx_offset + i; + } + auto sample_size_idxt = IdxT{sample_size}; + if (sample_size_idxt >= pop_size) return; + + // we must synchronize here since we have just written to smem + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); + + auto idx_end = idx_offset + pop_size; + auto n = idx_offset + sample_size_idxt; + auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x}; + GenT gen(rng_state, flat_id); + CUGRAPH_OPS_UNROLL + for (auto nidx = n + IdxT{lane}; nidx < idx_end; nidx += IdxT{utils::WARP_SIZE}) { + // nidx - idx_offset inclusive (necessary for correctness of algo R) + auto end = nidx - idx_offset + 1; + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = IdxT{end}; + int_params.diff = static_cast(end); + IdxT idx; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + if (idx < sample_size_idxt) { + // using atomic max instead of exch here because it leads to the same + // output as the sequential algorithm (DGL does this, too) + // Additionally, we use the index instead of the neighbor ID here + // since this allows copying over other node/edge-related data + // (useful for heterogeneous graphs for example) + utils::atomic_max(smem + idx, nidx); + } + } + // must synchronize to make smem valid + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); +} + +template +__device__ __forceinline__ void warp_algo_r(IdxT* smem, + IdxT row_id, + const IdxT* nodes, + const IdxT* fg_offsets, + int sample_size, + IdxT& node_id, + IdxT& node_start, + 
IdxT& node_end, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + if (nodes == nullptr) { + node_id = row_id; + if (lane == 0) + node_start = fg_offsets[node_id]; + else if (lane == 1) + node_end = fg_offsets[node_id + 1]; + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 1); + } else { + if (lane == 0) { + node_id = nodes[row_id]; + node_start = fg_offsets[node_id]; + node_end = fg_offsets[node_id + 1]; + } + node_id = utils::shfl(node_id, 0); + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 0); + } + auto pop_size = node_end - node_start; + warp_algo_r_index(smem, pop_size, node_start, sample_size, rng_state); +} + +// TODO(mjoux): support configuring n_warps_per_node in template +template +CUGRAPH_OPS_KERNEL void algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* neighbors, + IdxT* counts, + // edge_types / node_types should be non-const + // probably detected if `!IS_HG` + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* edge_types, + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* node_types, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + int sample_size) +{ + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_dst_nodes) { return; } + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + IdxT node_id, node_start, node_end; + warp_algo_r( + s_idx, row_id, nodes, offsets, sample_size, node_id, node_start, node_end, rng_state); + + IdxT count = 0; + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + auto nidx = s_idx[i]; + // checking for node_end here because sample_size may be larger than + // the total number of neighbors of the node + auto val = nidx < node_end ? indices[nidx] : cugraph::invalid_idx::value; + // TODO(mjoux) it's possible that we break the ELLPACK format here since + // if we set val to invalid, we should add it to end of list, rather + // than simply at index "i". 
This is ignored for now since the case + // where SAMPLE_SELF := false is rare and unconventional + if (!SAMPLE_SELF && val == node_id) val = cugraph::invalid_idx::value; + auto local_id = row_id * IdxT{sample_size} + i; + neighbors[local_id] = val; + if (val != cugraph::invalid_idx::value) { + ++count; + if (IS_HG) edge_types[local_id] = g_edge_types[nidx]; + } + } + if (IS_HG && lane == 0) node_types[row_id] = g_node_types[node_id]; + if (counts != nullptr) { + count = utils::warp_reduce(count); + if (lane == 0) { counts[row_id] = count; } + } +} + +template +void algo_r_impl(IdxT* neighbors, + IdxT* counts, + int32_t* edge_types, + int32_t* node_types, + raft::random::RngState& rng, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + IdxT g_n_dst_nodes, + IdxT sample_size, + IdxT max_val, + cudaStream_t stream) +{ + if (nodes == nullptr) { n_dst_nodes = g_n_dst_nodes; } + ASSERT(n_dst_nodes <= g_n_dst_nodes, + "Algo R: expected n_dst_nodes <= graph.n_dst_nodes (%ld > %ld)", + long(n_dst_nodes), + long(g_n_dst_nodes)); + ASSERT( + static_cast(sample_size) + 2 < static_cast(std::numeric_limits::max()), + "Expected sample size [+2] to be lower than INT_MAX"); + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_dst_nodes, N_WARPS); + int sample_size_i = static_cast(sample_size); + int32_t smem_sizes[] = {sample_size_i}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + if (static_cast(max_val) < std::numeric_limits::max()) { + // we'll use the 32-bit based method for generating random integers + // as we most likely do not need less bias + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } else { + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } + // update the rng state (this is a pessimistic update as it is difficult to + // compute the number of RNG calls done per thread!) + auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(max_val, g_n_dst_nodes) - sample_size), utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace cugraph::ops::graph diff --git a/cpp/src/from_cugraph_ops/device.cuh b/cpp/src/from_cugraph_ops/device.cuh new file mode 100644 index 00000000000..f7d37c62f35 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device.cuh @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_atomics.cuh" +#include "device_core.hpp" +#include "device_dim.cuh" +#include "device_smem_helper.cuh" +#include "device_warp_collectives.cuh" +#include "macros.hpp" diff --git a/cpp/src/from_cugraph_ops/device_atomics.cuh b/cpp/src/from_cugraph_ops/device_atomics.cuh new file mode 100644 index 00000000000..b8be7614284 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_atomics.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. 
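
The algo_r_kernel above parallelizes reservoir sampling (Algorithm R) across a warp, and the atomic-max update is there so the result matches the sequential algorithm. For reference, a sequential, index-based sketch of that algorithm; this is illustrative only and uses a standard-library RNG rather than the RAFT generators used in the kernel:

    #include <algorithm>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Sequential Algorithm R over the index range [idx_offset, idx_offset + pop_size):
    // keeps `sample_size` slots; stores *indices* (not neighbor ids) so that other
    // per-edge data can be gathered later, as in the kernel above.
    std::vector<int64_t> algo_r_reference(int64_t idx_offset,
                                          int64_t pop_size,
                                          int sample_size,
                                          std::mt19937_64& gen)
    {
      std::vector<int64_t> smem(std::min<int64_t>(sample_size, pop_size));
      for (int64_t i = 0; i < static_cast<int64_t>(smem.size()); ++i) {
        smem[i] = idx_offset + i;  // first `sample_size` indices are copied verbatim
      }
      for (int64_t n = sample_size; n < pop_size; ++n) {
        // draw uniformly from [0, n] inclusive, mirroring `end = nidx - idx_offset + 1`
        std::uniform_int_distribution<int64_t> dist(0, n);
        auto idx = dist(gen);
        if (idx < sample_size) { smem[idx] = idx_offset + n; }
      }
      return smem;
    }
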
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include +#include + +#include + +namespace cugraph::ops::utils { + +/** + * @defgroup AtomicMax Device atomic max operation + * + * @{ + */ +template +__device__ inline DataT atomic_max(DataT* address, DataT val) +{ + return atomicMax(address, val); +} +template <> +__device__ inline float atomic_max(float* address, float val) +{ + using u32_t = unsigned int; + auto* address_as_u32 = reinterpret_cast(address); + u32_t old = *address_as_u32, assumed; + do { + assumed = old; + old = atomicCAS(address_as_u32, assumed, __float_as_uint(max(val, __uint_as_float(assumed)))); + } while (assumed != old); + return __uint_as_float(old); +} +template <> +__device__ inline double atomic_max(double* address, double val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* address_as_ull = reinterpret_cast(address); + u64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(max(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); +} +template <> +__device__ inline int64_t atomic_max(int64_t* address, int64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +template <> +__device__ inline uint64_t atomic_max(uint64_t* address, uint64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +/** @} */ + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_core.hpp b/cpp/src/from_cugraph_ops/device_core.hpp new file mode 100644 index 00000000000..b548d2d4d1f --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_core.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "macros.hpp" + +namespace cugraph::ops::utils { + +/** number of threads per warp */ +static constexpr int WARP_SIZE = 32; + +/** minimum CUDA version required for warp shfl sync functions */ +static constexpr int CUDA_VER_WARP_SHFL = 9000; + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * + * @tparam IntT supposed to be only integers for now! + * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT ceil_div(IntT a, IntT b) +{ + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * + * @tparam IntT supposed to be only integers for now! 
+ * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT align_to(IntT a, IntT b) +{ + return ceil_div(a, b) * b; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_dim.cuh b/cpp/src/from_cugraph_ops/device_dim.cuh new file mode 100644 index 00000000000..275d0edd485 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_dim.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" + +namespace cugraph::ops::utils { + +/** get the lane id of the current thread */ +__device__ __forceinline__ int lane_id() +{ + int id; + asm("mov.s32 %0, %%laneid;" : "=r"(id)); + return id; +} + +/** + * get the flat id of the current thread (within block) + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y; + if (!USE_X && USE_Y && USE_Z) return threadIdx.y + threadIdx.z * blockDim.y; + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x; + if (USE_X && !USE_Y && USE_Z) return threadIdx.x + threadIdx.z * blockDim.x; + if (USE_X && USE_Y && !USE_Z) return threadIdx.x + threadIdx.y * blockDim.x; + // USE_X && USE_Y && USE_Z + return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; +} + +/** + * get the number of warps of the current block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int num_warps() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.z, WARP_SIZE); + if (!USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.y, WARP_SIZE); + if (!USE_X && USE_Y && USE_Z) return ceil_div(blockDim.y * blockDim.z, WARP_SIZE); + if (USE_X && !USE_Y && !USE_Z) return ceil_div(blockDim.x, WARP_SIZE); + if (USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.x * blockDim.z, WARP_SIZE); + if (USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.x * blockDim.y, WARP_SIZE); + // USE_X && USE_Y && USE_Z + return ceil_div(blockDim.x * blockDim.y * blockDim.z, WARP_SIZE); +} + +/** + * get the warp id of the current thread + * template parameters allow to control which CTA dimensions are used + * @note: this only makes sense if the first used dimension of the CTA size + * is a multiple of WARP_SIZE. 
If this is not the case, use + * `flat_id<...>() / WARP_SIZE` to get the warp id of the current thread + */ +template +__device__ __forceinline__ int warp_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z / WARP_SIZE; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y / WARP_SIZE; + if (!USE_X && USE_Y && USE_Z) + return threadIdx.y / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x / WARP_SIZE; + if (USE_X && !USE_Y && USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && USE_Y && !USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps(); + // USE_X && USE_Y && USE_Z + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps() + + threadIdx.z * blockDim.y * num_warps(); +} + +/** + * get the block dimension of the current executing block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int block_dim() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return blockDim.z; + if (!USE_X && USE_Y && !USE_Z) return blockDim.y; + if (!USE_X && USE_Y && USE_Z) return blockDim.y * blockDim.z; + if (USE_X && !USE_Y && !USE_Z) return blockDim.x; + if (USE_X && !USE_Y && USE_Z) return blockDim.x * blockDim.z; + if (USE_X && USE_Y && !USE_Z) return blockDim.x * blockDim.y; + // USE_X && USE_Y && USE_Z + return blockDim.x * blockDim.y * blockDim.z; +} + +/** + * get the flat id of the current thread (within device/grid) + * template parameters allow to control which grid and block/CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_grid_id() +{ + auto b_id = flat_id(); + auto b_dim = block_dim(); + if (!G_USE_X && !G_USE_Y && !G_USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.z * b_dim + b_id; + if (!G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.y * b_dim + b_id; + if (!G_USE_X && G_USE_Y && G_USE_Z) return blockIdx.y * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && !G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + b_id; + if (G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.x * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + blockIdx.y * blockDim.y + b_id; + // G_USE_X && G_USE_Y && G_USE_Z + return blockIdx.x * b_dim + blockIdx.y * blockDim.y * blockDim.z + blockIdx.z * blockDim.z + b_id; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_smem_helper.cuh b/cpp/src/from_cugraph_ops/device_smem_helper.cuh new file mode 100644 index 00000000000..f1b5be071d9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_smem_helper.cuh @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" + +#include +#include +#include + +namespace cugraph::ops::utils { + +// The following struct must be used to transmit the size and alignment of +// a field to the shared memory helpers below. +// By default, the alignment is just like the alignment of the original data type. 
+template +struct field_type { + using data_t = DataT; + static constexpr int32_t BYTES = static_cast(sizeof(DataT)); + static constexpr int32_t ALIGNMENT = ALIGN > 0 ? ALIGN : alignof(DataT); +}; + +// Imagine we have 2 fields of data in shared memory, one for ints, one for doubles. +// The intended usage of the following class in simple cases is as follows: +// 1. specify the type somewhere for both host and kernel code: +// using special_smem_name_t = smem_helper< 0, 0, field_type, field_type >; +// /* can be simplified to the following: */ +// using special_smem_name_t = smem_simple_t< int, double >; +// 2. in host code, get the size of shared memory: +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// /* note: sizes are always in number of elements, not bytes */ +// /* sizes always have type `int32_t` */ +// auto size = special_smem_name_t::get_size(sizes); +// 3. in device code, call the empty constructor: +// special_smem_name_t helper {}; +// int* s_ints; +// double* s_doubles; +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// helper.set_ptrs(sizes, s_ints, s_doubles); +// +// For more complicated use cases, it is often useful to create a struct overloading +// operator[] and passing that to the `get_size` or `set_ptrs` helpers. +// The struct can also be used to directly pass the size information from +// host code (launch) to the kernel, avoiding duplication of calculating sizes. +// Be aware that this overload must have a `__host__ __device__` signature. +// Here is an example struct for the above use case: +// struct sizes_t { +// int32_t n_ints, n_doubles; +// __host__ __device__ sizes_t() = delete; +// __host__ __device__ sizes_t(int32_t _n_ints, int32_t _n_doubles) : +// n_ints(_n_ints), n_doubles(_n_doubles) {} +// +// /* you may also just return int32_t here instead of const int32_t& */ +// __host__ __device__ const int32_t& operator[](int idx) const +// { +// return idx == 0 ? n_ints : n_doubles; +// } +// }; +// +// The ALIGN_INIT template parameter is important for correctness: +// By default (ALIGN_INIT=0), we assume that all alignments are powers of 2, +// and we set ALIGN_INIT to the max alignment of the fields. If you want more +// control, you can set it yourself, but we always assume that it is a multiple +// of all alignment values of the fields. +// +// The N_UNIT_FIELDS template parameters allows specifying sub-spaces +// for a given number of "units" (often warps) such that the first +// `N_UNIT_FIELDS` fields are reserved sub-spaces per unit. +// In this case, the `get_size` and `set_ptrs` methods are modified such that +// you have to specify the number of units, and for `set_ptrs` the unit ID +// as well. +// This is useful for reserving exclusive shared memory per warp for example. +// Each unit (warp) will have its sub-space (containing the `N_UNIT_FIELDS` +// fields) aligned to the initial alignment as described above. 
+template +class smem_helper { + public: + static constexpr size_t N_ARGS = sizeof...(FieldsT); + + protected: + static_assert(N_ARGS > 0, "smem_helper: must have at least one field type"); + static_assert(N_UNIT_FIELDS >= 0, "smem_helper: #unit fields must be non-negative"); + static_assert(N_UNIT_FIELDS <= N_ARGS, + "smem_helper: #unit fields must be smaller than #field types"); + // following static assertion for FieldsT to not be scalar types is based on + // https://stackoverflow.com/a/28253503/4134127 + template + struct bool_pack; + template + using all_true_t = std::is_same, bool_pack>; + static_assert(all_true_t::value...>::value, + "smem_helper: the given field template types must be of type `field_type` and " + "cannot be scalars"); + + template + __host__ __device__ static constexpr typename std::enable_if<(IDX < N_ARGS), int32_t>::type + max_align() + { + using f_t = typename std::tuple_element>::type; + static_assert(f_t::ALIGNMENT > 0, "field alignments must be greater than 0"); + return max_align() > f_t::ALIGNMENT ? max_align() : f_t::ALIGNMENT; + } + template + __host__ __device__ static constexpr typename std::enable_if<(IDX >= N_ARGS), int32_t>::type + max_align() + { + return -1; + } + + // this is assumed to be a multiple of all alignments + static constexpr int32_t ALIGN_BASE = ALIGN_INIT > 0 ? ALIGN_INIT : max_align<0>(); + + // here we exploit that the base pointer must be aligned to 16 bytes. + // if 16 is a multiple of ALIGN_BASE, that means we don't have any overhead. + // if ALIGN_BASE is a multiple of 16, it means that we need at most + // ALIGN_BASE - 16 extra bytes, otherwise it's ALIGN_BASE - 1 + static constexpr int32_t SIZE_OVERHEAD = 16 % ALIGN_BASE == 0 ? 0 + : ALIGN_BASE % 16 == 0 ? ALIGN_BASE - 16 + : ALIGN_BASE - 1; + + public: + // cannot easily use "= default" here for host-only code + // NOLINTNEXTLINE(modernize-use-equals-default) + __host__ __device__ smem_helper() + { +#if defined(__CUDA_ARCH__) + // must be aligned to 16 bytes on all supported architectures + // (don't have a reference for this at the moment!) + extern __shared__ uint8_t smem[]; + // align manually to `ALIGN_BASE`: this avoids the `__align(X)__` attribute + // which can cause issues if this is used in the same compilation unit + // with different types / alignments. + // In any case, the compiler/hardware cannot do a better job at providing + // an aligned pointer than we can do manually. 
+ auto smem_aligned = align_to(reinterpret_cast(smem), uintptr_t(ALIGN_BASE)); + base_ptr_ = reinterpret_cast(smem_aligned); +#endif + } + + template + __host__ __device__ static inline typename std::enable_if<(N <= 0), int32_t>::type get_size( + const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to ALIGN_BASE + size_helper<1>(current_total, sizes); + return SIZE_OVERHEAD + current_total; + } + + template + __host__ __device__ static inline typename std::enable_if<(N > 0), int32_t>::type get_size( + const int32_t n_units, const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // since the unit size is aligned to ALIGN_BASE, every base pointer for + // each unit as well as the base pointer after all units is aligned to + // ALIGN_BASE: since that is a multiple of all alignments, we can safely + // continue adding the sizes afterwards + auto unit_size = align_to(current_total, ALIGN_BASE); + current_total = 0; // base pointer must be aligned to all alignments + size_helper(current_total, sizes); + return SIZE_OVERHEAD + unit_size * n_units + current_total; + } + + template + __device__ inline typename std::enable_if<(N <= 0)>::type set_ptrs( + const SizeT& sizes, typename FieldsT::data_t*&... ptrs) const + { + return ptrs_helper<1>(0, 0, 0, 0, sizes, ptrs...); + } + + template + __device__ inline typename std::enable_if<(N > 0)>::type set_ptrs( + const int32_t& unit_id, + const int32_t& n_units, + const SizeT& sizes, + typename FieldsT::data_t*&... ptrs) const + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // see explanation in `get_size` for what aligning to ALIGN_BASE means + auto unit_size = align_to(current_total, ALIGN_BASE); + return ptrs_helper<1>(0, unit_id, unit_size, n_units, sizes, ptrs...); + } + + protected: + template + __host__ __device__ static inline void single_size(int32_t& current_total, const SizeT& sizes) + { + using next_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT : N_ARGS - 1), + std::tuple>::type; + using this_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT - 1 : N_ARGS - 1), + std::tuple>::type; + static constexpr int32_t ALIGN = + NEXT == N_UNIT_FIELDS || NEXT >= N_ARGS ? 
1 : next_field_t::ALIGNMENT; + current_total = align_to(current_total + sizes[NEXT - 1] * this_field_t::BYTES, ALIGN); + } + + // parentheses in `enable_if` here are used to help the parser understand "<>" + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_ARGS)>::type size_helper( + int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_ARGS)>::type size_helper( + int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + unit_size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __device__ inline void ptrs_helper(const int32_t& /* offset */, + const int32_t& /* unit_id */, + const int32_t& /* unit_size */, + const int32_t& /* n_units */, + const SizeT& /* sizes */) const + { + } + template + __device__ inline void ptrs_helper(const int32_t& offset, + const int32_t& unit_id, + const int32_t& unit_size, + const int32_t& n_units, + const SizeT& sizes, + PtrT*& ptr, + PtrsT*&... ptrs) const + { + // see `get_size`: base_ptr_ + u_off is always aligned to all alignments + // (whether for each individual unit or after all units) + auto u_off = NEXT <= N_UNIT_FIELDS ? unit_id * unit_size : n_units * unit_size; + ptr = reinterpret_cast(base_ptr_ + (u_off + offset)); + int32_t next_offset = offset; + if (NEXT == N_UNIT_FIELDS) + next_offset = 0; // pointer after all unit fields is aligned to all alignments + else + single_size(next_offset, sizes); + ptrs_helper(next_offset, unit_id, unit_size, n_units, sizes, ptrs...); + } + + uint8_t* base_ptr_{nullptr}; +}; + +template +using smem_simple_t = smem_helper<0, 0, field_type...>; + +template +using smem_unit_simple_t = smem_helper<0, N_UNIT_FIELDS, field_type...>; + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_warp_collectives.cuh b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh new file mode 100644 index 00000000000..198b3be2f12 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" +#include "device_dim.cuh" +#include "macros.hpp" + +#include + +namespace cugraph::ops::utils { + +/** + * @brief get a bit mask for the `n_threads` lowest threads of a warp + * + * @param[in] n_threads number of threads in the mask + * + * @return the bit mask + */ +__host__ __device__ constexpr uint32_t low_thread_mask(int n_threads) +{ + return n_threads >= WARP_SIZE ? 0xffffffffU : (1U << n_threads) - 1U; +} + +/** + * apply a warp-wide sync (useful from Volta+ archs) + * + * @tparam NP number of participating threads + * + * @note This works on Pascal and earlier archs as well, but all threads with + * lane id <= NP must enter this function together and in convergence. 
+ */
+template
+__device__ inline void warp_sync()
+{
+  __syncwarp(low_thread_mask(NP));
+}
+
+/**
+ * @brief Shuffle the data inside a warp
+ *
+ * @tparam DataT the data type (currently assumed to be 4B)
+ *
+ * @param[in] val value to be shuffled
+ * @param[in] src_lane lane from where to shuffle
+ * @param[in] width lane width
+ * @param[in] mask mask of participating threads (Volta+)
+ *
+ * @return the shuffled data
+ */
+template
+__device__ inline DataT shfl(DataT val,
+                             int src_lane,
+                             int width = WARP_SIZE,
+                             uint32_t mask = 0xffffffffU)
+{
+  static_assert(CUDART_VERSION >= CUDA_VER_WARP_SHFL,
+                "Expected CUDA >= 9 for warp synchronous shuffle");
+  return __shfl_sync(mask, val, src_lane, width);
+}
+
+/**
+ * @brief Warp-level sum reduction
+ *
+ * @tparam DataT data type
+ * @tparam NP number of participating threads.
+ *            must be a power of 2 and at most warp size
+ *
+ * @param[in] val input value
+ *
+ * @return only lane 0 will contain the valid reduced result
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block.
+ *
+ * @note All threads with lane id <= NP must enter this function together
+ *
+ * TODO(mjoux) Expand this to support arbitrary reduction ops
+ */
+template
+__device__ inline DataT warp_reduce(DataT val)
+{
+  static constexpr uint32_t MASK = low_thread_mask(NP);
+  CUGRAPH_OPS_UNROLL
+  for (int i = NP / 2; i > 0; i >>= 1) {
+    DataT tmp = shfl(val, lane_id() + i, NP, MASK);
+    val += tmp;
+  }
+  return val;
+}
+
+}  // namespace cugraph::ops::utils
diff --git a/cpp/src/from_cugraph_ops/macros.hpp b/cpp/src/from_cugraph_ops/macros.hpp
new file mode 100644
index 00000000000..0ff08af0b1a
--- /dev/null
+++ b/cpp/src/from_cugraph_ops/macros.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ */
+
+#define CUGRAPH_OPS_STRINGIFY_DETAIL(x) #x
+#define CUGRAPH_OPS_STRINGIFY(x) CUGRAPH_OPS_STRINGIFY_DETAIL(x)
+
+#define CUGRAPH_OPS_UNROLL _Pragma("unroll")
+#if defined(__clang__) && defined(__CUDA__)
+// clang wants pragma unroll without parentheses
+#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll n))
+#else
+// nvcc / nvrtc want pragma unroll with parentheses
+#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll(n)))
+#endif
+
+#if defined(__clang__)
+#define CUGRAPH_OPS_CONSTEXPR_D constexpr
+#else
+#define CUGRAPH_OPS_CONSTEXPR_D constexpr __device__
+#endif
+
+#if defined(__CUDACC__) || defined(__CUDA__)
+#define CUGRAPH_OPS_HD __host__ __device__
+#else
+#define CUGRAPH_OPS_HD
+#endif
+
+// The CUGRAPH_OPS_KERNEL macro specifies that a kernel has hidden visibility
+//
+// cugraph-ops needs to ensure that its CUGRAPH_OPS_KERNEL function
+// templates have hidden visibility (the default is weak visibility).
+//
+// When kernels have weak visibility it means that if two dynamic libraries
+// both contain identical instantiations of a kernel/template, then the linker
+// will discard one of the two instantiations and use only one of them.
+//
+// Due to unique requirements of how CUDA works, this de-duplication
+// can lead to the wrong kernels being called (SM version being wrong),
+// silently no kernel being called at all, or CUDA runtime errors being
+// thrown.
+// +// https://github.com/rapidsai/raft/issues/1722 +#ifndef CUGRAPH_OPS_KERNEL +#define CUGRAPH_OPS_KERNEL __global__ static +#endif diff --git a/cpp/src/from_cugraph_ops/sampling.hpp b/cpp/src/from_cugraph_ops/sampling.hpp new file mode 100644 index 00000000000..5663b8d9c03 --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +// FIXME: This is only here for the prims... +// Need to look how Seunghwa fixed this in his PR +#include + +#include + +#include + +#include + +namespace cugraph::legacy::ops::graph { + +/** + * @brief Generate indexes given population sizes and a sample size, + * with or without replacement + * + * @param[out] index The (dense) index matrix. [on device] + * [dim = `n_sizes x sample_size`] + * In case `replace` is `false`, this may contain + * `ops::graph::INVALID_ID` + * if no index could be generated. + * @param[inout] rng RAFT RngState state object + * @param[in] sizes Input array of population sizes [on device] + * [len = `n_sizes`] + * @param[in] n_sizes number of sizes to sample from. + * @param[in] sample_size max number of indexes to be sampled per element + * in `sizes`. Assumed to be <= 384 at the moment. + * @param[in] replace If `true`, sample with replacement, otherwise + * without replacement. + * @param[in] stream cuda stream + * + @{ + */ +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cu b/cpp/src/from_cugraph_ops/sampling_index.cu new file mode 100644 index 00000000000..fb1f4ac3f1e --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#include "sampling.hpp" +#include "sampling_index.cuh" + +namespace cugraph::legacy::ops::graph { + +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cuh b/cpp/src/from_cugraph_ops/sampling_index.cuh new file mode 100644 index 00000000000..9ac574315bb --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cuh @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "algo_R.cuh" +#include "sampling.hpp" + +#include +#include + +#include + +namespace cugraph::legacy::ops::graph { + +namespace utils = cugraph::ops::utils; + +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +CUGRAPH_OPS_KERNEL void index_replace_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = cugraph::ops::utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto n_warps = utils::num_warps(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{n_warps}; + if (row_id >= n_sizes) return; + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. check valid size: possible early-out + if (size <= 0) { + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + index[row_id * IdxT{sample_size} + IdxT{i}] = cugraph::invalid_idx::value; + } + return; + } + + // 4. every thread generates its indexes + auto flat_id = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + GenT gen(rng_state, flat_id); + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = size; + int_params.diff = static_cast(size); + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + IdxT idx = IdxT{0}; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + + // 5. output index + index[row_id * IdxT{sample_size} + IdxT{i}] = idx; + } +} + +template +void get_sampling_index_replace(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // keep thread per block fairly low since we can expect sample_size < warp_size + // thus we want to have as many blocks as possible to increase parallelism + static constexpr int TPB = 128; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + RAFT_CALL_RNG_FUNC( + rng, (index_replace_kernel<<>>), index, sizes, n_sizes, sample_size); + auto thread_rs = utils::ceil_div(IdxT{sample_size}, utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +CUGRAPH_OPS_KERNEL void index_algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_sizes) return; + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. 
Get algo R indexes per warp + cugraph::ops::graph::warp_algo_r_index( + s_idx, size, IdxT{0}, sample_size, rng_state); + + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + // 4. output index + // still need to check if the index is actually valid + auto idx = s_idx[i]; + index[row_id * IdxT{sample_size} + IdxT{i}] = + idx >= size ? cugraph::invalid_idx::value : idx; + } +} + +template +void get_sampling_index_reservoir(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // same TPB as in algo R: increased SM occupancy is most important here + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + int32_t smem_sizes[] = {sample_size}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + RAFT_CALL_RNG_FUNC(rng, + (index_algo_r_kernel<<>>), + index, + sizes, + n_sizes, + sample_size); + auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(std::numeric_limits::max(), n_sizes) - IdxT{sample_size}), + utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void get_sampling_index_impl(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + if (replace) { + get_sampling_index_replace(index, rng, sizes, n_sizes, sample_size, stream); + } else { + get_sampling_index_reservoir(index, rng, sizes, n_sizes, sample_size, stream); + } +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/generators/erdos_renyi_generator.cuh b/cpp/src/generators/erdos_renyi_generator.cuh index cd461ee1aa2..10573ddb0d0 100644 --- a/cpp/src/generators/erdos_renyi_generator.cuh +++ b/cpp/src/generators/erdos_renyi_generator.cuh @@ -40,6 +40,11 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, vertex_t base_vertex_id, uint64_t seed) { + // NOTE: + // https://networkx.org/documentation/stable/_modules/networkx/generators/random_graphs.html#fast_gnp_random_graph + // identifies a faster algorithm that I think would be very efficient on the GPU. I believe we + // could just compute lr/lp in that code for a batch of values, use prefix sums to generate edge + // ids and then convert the generated values to a batch of edges. CUGRAPH_EXPECTS(num_vertices < std::numeric_limits::max(), "Implementation cannot support specified value"); @@ -88,6 +93,11 @@ generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, uint64_t seed) { CUGRAPH_FAIL("Not implemented"); + + // To implement: + // Use sampling function to select `m` unique edge ids from the + // (num_vertices ^ 2) possible edges. Convert these to vertex + // ids. 
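+  //
+  // Illustrative (untested) sketch of that conversion step, assuming each
+  // sampled edge id e is a 64-bit value in [0, num_vertices * num_vertices):
+  //   vertex_t src = static_cast<vertex_t>(e / num_vertices);
+  //   vertex_t dst = static_cast<vertex_t>(e % num_vertices);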
} } // namespace cugraph diff --git a/cpp/src/layout/legacy/barnes_hut.cuh b/cpp/src/layout/legacy/barnes_hut.cuh index fa6d3816417..fdd57c7772d 100644 --- a/cpp/src/layout/legacy/barnes_hut.cuh +++ b/cpp/src/layout/legacy/barnes_hut.cuh @@ -19,7 +19,6 @@ #include "bh_kernels.cuh" #include "converters/legacy/COOtoCSR.cuh" #include "fa2_kernels.cuh" -#include "utilities/graph_utils.cuh" #include "utils.hpp" #include diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index 33e7841a380..195889eebfb 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -17,7 +17,9 @@ #pragma once #define restrict __restrict__ -#include "utilities/graph_utils.cuh" +// From old graph_utils.cuh +#define CUDA_MAX_BLOCKS 65535 +#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block namespace cugraph { namespace detail { diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index c4c071b8f2c..7d4750c0554 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/partition_v_frontier.cuh" #include "prims/detail/transform_v_frontier_e.cuh" #include "prims/property_op_utils.cuh" @@ -33,9 +34,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include @@ -639,7 +637,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( auto mid_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; if (mid_partition_size > 0) { // FIXME: tmp_degrees & tmp_nbr_indices can be avoided if we customize - // cugraph::ops::get_sampling_index + // cugraph::legacy::ops::get_sampling_index rmm::device_uvector tmp_degrees(mid_partition_size, handle.get_stream()); rmm::device_uvector tmp_nbr_indices(mid_partition_size * K, handle.get_stream()); thrust::gather(handle.get_thrust_policy(), @@ -647,13 +645,13 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( frontier_indices.begin() + frontier_partition_offsets[2], frontier_degrees.begin(), tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index(tmp_nbr_indices.data(), - rng_state, - tmp_degrees.data(), - mid_partition_size, - static_cast(K), - false, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(tmp_nbr_indices.data(), + rng_state, + tmp_degrees.data(), + mid_partition_size, + static_cast(K), + false, + handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -736,7 +734,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( } if (retry_segment_indices) { - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( (*retry_nbr_indices).data(), rng_state, (*retry_degrees).begin(), @@ -752,7 +750,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( segment_frontier_degree_first, segment_frontier_degree_first + num_segments, tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( tmp_nbr_indices.data(), rng_state, tmp_degrees.data(), @@ -1626,13 +1624,13 @@ uniform_sample_and_compute_local_nbr_indices( if (with_replacement) { if (frontier_degrees.size() > 0) { nbr_indices.resize(frontier_degrees.size() * 
K, handle.get_stream()); - cugraph::ops::graph::get_sampling_index(nbr_indices.data(), - rng_state, - frontier_degrees.data(), - static_cast(frontier_degrees.size()), - static_cast(K), - with_replacement, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), + rng_state, + frontier_degrees.data(), + static_cast(frontier_degrees.size()), + static_cast(K), + with_replacement, + handle.get_stream()); frontier_degrees.resize(0, handle.get_stream()); frontier_degrees.shrink_to_fit(handle.get_stream()); } diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 015a9c683f1..03514e52e6e 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/sample_and_compute_local_nbr_indices.cuh" #include "prims/property_op_utils.cuh" @@ -30,9 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh new file mode 100644 index 00000000000..93bb03077bc --- /dev/null +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "prims/reduce_v.cuh" +#include "prims/update_edge_src_dst_property.cuh" +#include "utilities/collect_comm.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +template +std::tuple>, + std::optional>> +normalize_biases(raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span biases) +{ + std::optional> normalized_biases{std::nullopt}; + std::optional> gpu_biases{std::nullopt}; + + // Need to normalize the biases + normalized_biases = + std::make_optional>(biases.size(), handle.get_stream()); + + weight_t sum = + thrust::reduce(handle.get_thrust_policy(), biases.begin(), biases.end(), weight_t{0}); + + thrust::transform(handle.get_thrust_policy(), + biases.begin(), + biases.end(), + normalized_biases->begin(), + divider_t{sum}); + + thrust::inclusive_scan(handle.get_thrust_policy(), + normalized_biases->begin(), + normalized_biases->end(), + normalized_biases->begin()); + + if constexpr (multi_gpu) { + rmm::device_scalar d_sum(sum, handle.get_stream()); + + gpu_biases = cugraph::device_allgatherv( + handle, handle.get_comms(), raft::device_span{d_sum.data(), d_sum.size()}); + + weight_t aggregate_sum = thrust::reduce( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), weight_t{0}); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 results in the possibility + // that 1 can appear as a random floating point value. 
We're going to use + // thrust::upper_bound to assign random values to GPUs, we need the value 1.0 to + // be part of the upper-most range. We'll compute the last non-zero value in the + // gpu_biases array here and below we will fill it with a value larger than 1.0 + size_t trailing_zeros = thrust::distance( + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::find_if(handle.get_thrust_policy(), + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::make_reverse_iterator(gpu_biases->begin()), + [] __device__(weight_t bias) { return bias > weight_t{0}; })); + + thrust::transform(handle.get_thrust_policy(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_biases->begin(), + divider_t{aggregate_sum}); + + thrust::inclusive_scan( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), gpu_biases->begin()); + + // FIXME: conclusion of above. Using 1.1 since it is > 1.0 and easy to type + thrust::copy_n(handle.get_thrust_policy(), + thrust::make_constant_iterator(1.1), + trailing_zeros + 1, + gpu_biases->begin() + gpu_biases->size() - trailing_zeros - 1); + } + + return std::make_tuple(std::move(normalized_biases), std::move(gpu_biases)); +} + +template +rmm::device_uvector create_local_samples( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> const& normalized_biases, + std::optional> const& gpu_biases, + size_t samples_in_this_batch) +{ + rmm::device_uvector samples(0, handle.get_stream()); + + if (normalized_biases) { + size_t samples_to_generate{samples_in_this_batch}; + std::vector sample_count_from_each_gpu; + + rmm::device_uvector position(0, handle.get_stream()); + + if constexpr (multi_gpu) { + // Determine how many vertices are generated on each GPU + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + sample_count_from_each_gpu.resize(comm_size); + + rmm::device_uvector gpu_counts(comm_size, handle.get_stream()); + position.resize(samples_in_this_batch, handle.get_stream()); + + thrust::fill(handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), size_t{0}); + thrust::sequence(handle.get_thrust_policy(), position.begin(), position.end()); + + rmm::device_uvector random_values(samples_in_this_batch, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(random_values.begin(), position.begin()), + thrust::make_zip_iterator(random_values.end(), position.end())); + + thrust::upper_bound(handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_counts.begin()); + + thrust::adjacent_difference( + handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), gpu_counts.begin()); + + std::vector tx_counts(gpu_counts.size()); + std::fill(tx_counts.begin(), tx_counts.end(), size_t{1}); + + rmm::device_uvector d_sample_count_from_each_gpu(0, handle.get_stream()); + + std::tie(d_sample_count_from_each_gpu, std::ignore) = + shuffle_values(handle.get_comms(), gpu_counts.begin(), tx_counts, handle.get_stream()); + + samples_to_generate = thrust::reduce(handle.get_thrust_policy(), + d_sample_count_from_each_gpu.begin(), + d_sample_count_from_each_gpu.end(), + size_t{0}); + + raft::update_host(sample_count_from_each_gpu.data(), + d_sample_count_from_each_gpu.data(), + 
d_sample_count_from_each_gpu.size(), + handle.get_stream()); + } + + // Generate samples + // FIXME: We could save this memory if we had an iterator that + // generated random values. + rmm::device_uvector random_values(samples_to_generate, handle.get_stream()); + samples.resize(samples_to_generate, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::transform( + handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + samples.begin(), + [biases = + raft::device_span{normalized_biases->data(), normalized_biases->size()}, + offset = graph_view.local_vertex_partition_range_first()] __device__(weight_t r) { + size_t result = + offset + + static_cast(thrust::distance( + biases.begin(), thrust::lower_bound(thrust::seq, biases.begin(), biases.end(), r))); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 + // results in the possibility that 1 can appear as a + // random floating point value, which results in the sampling + // algorithm below generating a value that's OOB. + if (result == (offset + biases.size())) --result; + + return result; + }); + + // Shuffle them back + if constexpr (multi_gpu) { + std::tie(samples, std::ignore) = shuffle_values( + handle.get_comms(), samples.begin(), sample_count_from_each_gpu, handle.get_stream()); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(position.begin(), samples.begin()), + thrust::make_zip_iterator(position.end(), samples.begin())); + } + } else { + samples.resize(samples_in_this_batch, handle.get_stream()); + + // Uniformly select a vertex from any GPU + detail::uniform_random_fill(handle.get_stream(), + samples.data(), + samples.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + } + + return samples; +} + +} // namespace detail + +template +std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_biases, + std::optional> dst_biases, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check) +{ + rmm::device_uvector src(0, handle.get_stream()); + rmm::device_uvector dst(0, handle.get_stream()); + + // Optimistically assume we can do this in one pass + size_t samples_in_this_batch = num_samples; + + // Normalize the biases and (for MG) determine how the biases are + // distributed across the GPUs. + std::optional> normalized_src_biases{std::nullopt}; + std::optional> gpu_src_biases{std::nullopt}; + std::optional> normalized_dst_biases{std::nullopt}; + std::optional> gpu_dst_biases{std::nullopt}; + + if (src_biases) + std::tie(normalized_src_biases, gpu_src_biases) = + detail::normalize_biases(handle, graph_view, *src_biases); + + if (dst_biases) + std::tie(normalized_dst_biases, gpu_dst_biases) = + detail::normalize_biases(handle, graph_view, *dst_biases); + + while (samples_in_this_batch > 0) { + if constexpr (multi_gpu) { + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + samples_in_this_batch = + (samples_in_this_batch / static_cast(comm_size)) + + (static_cast(comm_rank) < (samples_in_this_batch % static_cast(comm_size)) + ? 
1 + : 0); + } + + auto batch_src = create_local_samples( + handle, rng_state, graph_view, normalized_src_biases, gpu_src_biases, samples_in_this_batch); + auto batch_dst = create_local_samples( + handle, rng_state, graph_view, normalized_dst_biases, gpu_dst_biases, samples_in_this_batch); + + if constexpr (multi_gpu) { + auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + + std::tie(batch_src, batch_dst, std::ignore, std::ignore, std::ignore, std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(batch_src), + std::move(batch_dst), + std::nullopt, + std::nullopt, + std::nullopt, + vertex_partition_range_lasts); + } + + if (remove_existing_edges) { + auto has_edge_flags = + graph_view.has_edge(handle, + raft::device_span{batch_src.data(), batch_src.size()}, + raft::device_span{batch_dst.data(), batch_dst.size()}, + do_expensive_check); + + auto begin_iter = thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()); + auto new_end = thrust::remove_if(handle.get_thrust_policy(), + begin_iter, + begin_iter + batch_src.size(), + has_edge_flags.begin(), + thrust::identity()); + + batch_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + batch_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + } + + if (remove_duplicates) { + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + auto new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + size_t new_size = + thrust::distance(thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), new_end); + + if (src.size() > 0) { + rmm::device_uvector new_src(src.size() + new_size, handle.get_stream()); + rmm::device_uvector new_dst(dst.size() + new_size, handle.get_stream()); + + thrust::merge(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + new_end, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin())); + + new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), + thrust::make_zip_iterator(new_src.end(), new_dst.end())); + + new_size = + thrust::distance(thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), new_end); + + src = std::move(new_src); + dst = std::move(new_dst); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + src.resize(new_size, handle.get_stream()); + dst.resize(new_size, handle.get_stream()); + } else if (src.size() > 0) { + size_t current_end = src.size(); + + src.resize(src.size() + batch_src.size(), handle.get_stream()); + dst.resize(dst.size() + batch_dst.size(), handle.get_stream()); + + thrust::copy(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end()), + thrust::make_zip_iterator(src.begin(), dst.begin()) + current_end); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + if (exact_number_of_samples) { + size_t current_sample_size = src.size(); + if constexpr (multi_gpu) { + current_sample_size = cugraph::host_scalar_allreduce( + handle.get_comms(), 
current_sample_size, raft::comms::op_t::SUM, handle.get_stream()); + } + + // FIXME: We could oversample and discard the unnecessary samples + // to reduce the number of iterations in the outer loop, but it seems like + // exact_number_of_samples is an edge case not worth optimizing for at this time. + samples_in_this_batch = num_samples - current_sample_size; + } else { + samples_in_this_batch = 0; + } + } + + src.shrink_to_fit(handle.get_stream()); + dst.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(src), std::move(dst)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e32.cu b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu new file mode 100644 index 00000000000..ce54d54d319 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e64.cu b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu new file mode 100644 index 00000000000..af4c28c0f1a --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v64_e64.cu b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu new file mode 100644 index 00000000000..c5691fb4644 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e32.cu b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu new file mode 100644 index 00000000000..3712414e4ec --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e64.cu b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu new file mode 100644 index 00000000000..c66c31a4258 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v64_e64.cu b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu new file mode 100644 index 00000000000..e4fc50890e4 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 3b0bc15df93..0b1d9dcdb56 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp index 45cc1e54cb4..2c5658b32a5 100644 --- a/cpp/src/sampling/rw_traversals.hpp +++ b/cpp/src/sampling/rw_traversals.hpp @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 5371d53bcf0..f925a142737 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -803,7 +803,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), @@ -883,7 +883,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), diff --git a/cpp/src/structure/legacy/graph.cu b/cpp/src/structure/legacy/graph.cu index 7e1238e1558..a504125080b 100644 --- a/cpp/src/structure/legacy/graph.cu +++ b/cpp/src/structure/legacy/graph.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include "utilities/graph_utils.cuh" - #include #include +#include #include #include +#include #include #include +#include namespace { @@ -69,15 +70,40 @@ namespace legacy { template void GraphViewBase::get_vertex_identifiers(VT* identifiers) const { - cugraph::detail::sequence(number_of_vertices, identifiers); + thrust::sequence(thrust::device, + thrust::device_pointer_cast(identifiers), + thrust::device_pointer_cast(identifiers + number_of_vertices), + VT{0}); + RAFT_CHECK_CUDA(nullptr); } +// FIXME: Need to get rid of this function... 
still used in python template void GraphCompressedSparseBaseView::get_source_indices(VT* src_indices) const { CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); - cugraph::detail::offsets_to_indices( - offsets, GraphViewBase::number_of_vertices, src_indices); + rmm::cuda_stream_view stream_view; + + raft::device_span indices_span(src_indices, GraphViewBase::number_of_edges); + + if (indices_span.size() > 0) { + thrust::fill(rmm::exec_policy(stream_view), indices_span.begin(), indices_span.end(), VT{0}); + + thrust::for_each(rmm::exec_policy(stream_view), + offsets + 1, + offsets + GraphViewBase::number_of_vertices, + [indices_span] __device__(ET offset) { + if (offset < static_cast(indices_span.size())) { + cuda::atomic_ref atomic_counter( + indices_span.data()[offset]); + atomic_counter.fetch_add(VT{1}, cuda::std::memory_order_relaxed); + } + }); + thrust::inclusive_scan(rmm::exec_policy(stream_view), + indices_span.begin(), + indices_span.end(), + indices_span.begin()); + } } template @@ -152,6 +178,4 @@ void GraphCompressedSparseBaseView::degree(ET* degree, DegreeDirecti } // namespace legacy } // namespace cugraph -#include "utilities/eidir_graph_utils.hpp" - #include diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp index e6857a5beda..d7502b3f6da 100644 --- a/cpp/src/structure/select_random_vertices_impl.hpp +++ b/cpp/src/structure/select_random_vertices_impl.hpp @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "from_cugraph_ops/sampling.hpp" #include #include @@ -30,10 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif - #include #include #include diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 3790c0057cb..40030e2e39c 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -17,7 +17,6 @@ #include "detail/graph_partition_utils.cuh" #include "utilities/collect_comm.cuh" -#include "utilities/graph_utils.cuh" #include #include diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp deleted file mode 100644 index abf026cbbfe..00000000000 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -namespace cugraph { -namespace detail { - -extern template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -extern template void offsets_to_indices(int const*, int, int*); -extern template void offsets_to_indices(long const*, int, int*); -extern template void offsets_to_indices(long const*, long, long*); - -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(int const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, long, long*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp deleted file mode 100644 index ba06c6f56ea..00000000000 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -namespace cugraph { -namespace detail { - -template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -template void offsets_to_indices(int32_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int64_t, int64_t*); - -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int32_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int64_t, int64_t*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh deleted file mode 100644 index 0b257e7abde..00000000000 --- a/cpp/src/utilities/graph_utils.cuh +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Interanl helper functions -// Author: Alex Fender afender@nvidia.com -#pragma once - -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cugraph { -namespace detail { - -// #define DEBUG 1 -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block -#define US - -template -__inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const* ind, value_t const* w) -{ - count_t i, j, mn; - value_t v, last; - value_t sum = 0.0; - bool valid; - - // Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); // n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - // All threads (especially the last one) must always participate - // in the shfl instruction, otherwise their sum will be undefined. - // So, the loop stopping condition is based on multiple of n in loop increments, - // so that all threads enter into the loop and inside we make sure we do not - // read out of bounds memory checking for the actual size n. - - // check if the thread is valid - valid = i < n; - - // Notice that the last thread is used to propagate the prefix sum. - // For all the threads, in the first iteration the last is 0, in the following - // iterations it is the value at the last thread of the previous iterations. 
- - // get the value of the last thread - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? w[ind[i]] : 0.0; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = __shfl_up_sync(raft::warp_full_mask(), sum, j, blockDim.x); - if (threadIdx.x >= j) sum += v; - } - // shift by last - sum += last; - // notice that no __threadfence or __syncthreads are needed in this implementation - } - // get the value of the last thread (to all threads) - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - return last; -} - -// axpy -template -struct axpy_functor : public thrust::binary_function { - const T a; - axpy_functor(T _a) : a(_a) {} - __host__ __device__ T operator()(const T& x, const T& y) const { return a * x + y; } -}; - -template -void axpy(size_t n, T a, T* x, T* y) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// norm -template -struct square { - __host__ __device__ T operator()(const T& x) const { return x * x; } -}; - -template -T nrm2(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T init = 0; - T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -T nrm1(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T result = thrust::reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n)); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -void scal(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void addv(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::plus()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void fill(size_t n, T* x, T value) -{ - rmm::cuda_stream_view stream_view; - thrust::fill(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - value); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void scatter(size_t n, T* src, T* dst, M* map) -{ - rmm::cuda_stream_view stream_view; - thrust::scatter(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(src), - thrust::device_pointer_cast(src + n), - thrust::device_pointer_cast(map), - thrust::device_pointer_cast(dst)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void printv(size_t n, T* vec, int offset) -{ - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy( - dev_ptr + offset, - dev_ptr + offset + n, - 
std::ostream_iterator( - std::cout, " ")); // Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) - RAFT_CHECK_CUDA(nullptr); - std::cout << std::endl; -} - -template -void copy(size_t n, T* x, T* res) -{ - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - rmm::cuda_stream_view stream_view; - thrust::copy_n(rmm::exec_policy(stream_view), dev_ptr, n, res_ptr); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -struct is_zero { - __host__ __device__ bool operator()(const T x) { return x == 0; } -}; - -template -struct dangling_functor : public thrust::unary_function { - const T val; - dangling_functor(T _val) : val(_val) {} - __host__ __device__ T operator()(const T& x) const { return val + x; } -}; - -template -void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) -{ - rmm::cuda_stream_view stream_view; - thrust::transform_if(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// google matrix kernels -template -__global__ static void degree_coo(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - atomicAdd(°ree[ind[i]], (ValueType)1.0); -} - -template -__global__ static void flag_leafs_kernel(const size_t n, - const IndexType* degree, - ValueType* bookmark) -{ - for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - if (degree[i] == 0) bookmark[i] = 1.0; -} - -template -__global__ static void degree_offsets(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - degree[i] += ind[i + 1] - ind[i]; -} - -template -__global__ static void type_convert(FromType* array, int n) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { - ToType val = array[i]; - ToType* vals = (ToType*)array; - vals[i] = val; - } -} - -template -__global__ static void equi_prob3(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int j, row, col; - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - val[j] = 1.0 / degree[col]; - // val[j] = 999; - } - } -} - -template -__global__ static void equi_prob2(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) { - int row_begin = csrPtr[row]; - int row_end = csrPtr[row + 1]; - int col; - for (int i = row_begin; i < row_end; i++) { - col = csrInd[i]; - val[i] = 1.0 / degree[col]; - } - } -} - -// compute the H^T values for an already transposed adjacency matrix, leveraging coo info -template -void HT_matrix_csc_coo(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - ValueType* bookmark) -{ - rmm::cuda_stream_view stream_view; - rmm::device_uvector degree(n, stream_view); - - dim3 nthreads, nblocks; - 
nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo - <<>>(n, e, csrInd, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - int y = 4; - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; - equi_prob3 - <<>>(n, e, csrPtr, csrInd, val, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - ValueType a = 0.0; - fill(n, bookmark, a); - RAFT_CHECK_CUDA(stream_view.value()); - - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - flag_leafs_kernel - <<>>(n, degree.data(), bookmark); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel( - const offsets_t* offsets, index_t v, index_t* indices) -{ - auto tid{threadIdx.x}; - auto ctaStart{blockIdx.x}; - - for (index_t j = ctaStart; j < v; j += gridDim.x) { - offsets_t colStart = offsets[j]; - offsets_t colEnd = offsets[j + 1]; - offsets_t rowNnz = colEnd - colStart; - - for (offsets_t i = 0; i < rowNnz; i += blockDim.x) { - if ((colStart + tid + i) < colEnd) { indices[colStart + tid + i] = j; } - } - } -} - -template -void offsets_to_indices(const offsets_t* offsets, index_t v, index_t* indices) -{ - cudaStream_t stream{nullptr}; - index_t nthreads = min(v, (index_t)CUDA_MAX_KERNEL_THREADS); - index_t nblocks = min((v + nthreads - 1) / nthreads, (index_t)CUDA_MAX_BLOCKS); - offsets_to_indices_kernel<<>>(offsets, v, indices); - RAFT_CHECK_CUDA(stream); -} - -template -void sequence(IndexType n, IndexType* vec, IndexType init = 0) -{ - thrust::sequence( - thrust::device, thrust::device_pointer_cast(vec), thrust::device_pointer_cast(vec + n), init); - RAFT_CHECK_CUDA(nullptr); -} - -template -bool has_negative_val(DistType* arr, size_t n) -{ - // custom kernel with boolean bitwise reduce may be - // faster. 
- rmm::cuda_stream_view stream_view; - DistType result = *thrust::min_element(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(arr), - thrust::device_pointer_cast(arr + n)); - - RAFT_CHECK_CUDA(stream_view.value()); - - return (result < 0); -} - -} // namespace detail -} // namespace cugraph - -#include "eidecl_graph_utils.hpp" diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 52d257b9bea..da31f498de1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(cugraphtestutil STATIC utilities/misc_utilities.cpp utilities/conversion_utilities_sg.cu utilities/debug_utilities_sg.cpp + utilities/validation_utilities.cu link_prediction/similarity_compare.cpp centrality/betweenness_centrality_validate.cu community/egonet_validate.cu @@ -487,6 +488,10 @@ ConfigureTest(BIASED_NEIGHBOR_SAMPLING_TEST sampling/biased_neighbor_sampling.cp # - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +################################################################################################### +# - NEGATIVE SAMPLING tests -------------------------------------------------------------------- +ConfigureTest(NEGATIVE_SAMPLING_TEST sampling/negative_sampling.cpp) + ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- ConfigureTest(RENUMBERING_TEST structure/renumbering_test.cpp) @@ -741,6 +746,11 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG BIASED NBR SAMPLING tests -------------------------------------------------------------- ConfigureTestMG(MG_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_biased_neighbor_sampling.cpp) + ################################################################################################### + # - NEGATIVE SAMPLING tests -------------------------------------------------------------------- + ConfigureTestMG(MG_NEGATIVE_SAMPLING_TEST sampling/mg_negative_sampling.cpp) + + ############################################################################################### # - MG RANDOM_WALKS tests --------------------------------------------------------------------- ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp) @@ -773,6 +783,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_HITS_TEST c_api/mg_hits_test.c) ConfigureCTestMG(MG_CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/mg_uniform_neighbor_sample_test.c) ConfigureCTestMG(MG_CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/mg_biased_neighbor_sample_test.c) + ConfigureCTestMG(MG_CAPI_NEGATIVE_SAMPLING_TEST c_api/mg_negative_sampling_test.c) ConfigureCTestMG(MG_CAPI_LOOKUP_SRC_DST_TEST c_api/mg_lookup_src_dst_test.c) ConfigureCTestMG(MG_CAPI_RANDOM_WALKS_TEST c_api/mg_random_walks_test.c) ConfigureCTestMG(MG_CAPI_TRIANGLE_COUNT_TEST c_api/mg_triangle_count_test.c) @@ -812,6 +823,7 @@ ConfigureCTest(CAPI_WEAKLY_CONNECTED_COMPONENTS_TEST c_api/weakly_connected_comp ConfigureCTest(CAPI_STRONGLY_CONNECTED_COMPONENTS_TEST c_api/strongly_connected_components_test.c) ConfigureCTest(CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/uniform_neighbor_sample_test.c) ConfigureCTest(CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/biased_neighbor_sample_test.c) +ConfigureCTest(CAPI_NEGATIVE_SAMPLING_TEST c_api/negative_sampling_test.c) ConfigureCTest(CAPI_RANDOM_WALKS_TEST c_api/sg_random_walks_test.c) 
ConfigureCTest(CAPI_TRIANGLE_COUNT_TEST c_api/triangle_count_test.c) ConfigureCTest(CAPI_LOUVAIN_TEST c_api/louvain_test.c) diff --git a/cpp/tests/c_api/mg_negative_sampling_test.c b/cpp/tests/c_api/mg_negative_sampling_test.c new file mode 100644 index 00000000000..3289206d8db --- /dev/null +++ b/cpp/tests/c_api/mg_negative_sampling_test.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_MG_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_mg_test_graph_new(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + int rank = cugraph_resource_handle_get_rank(handle); + + if (num_biases > 0) { + if (rank == 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = 
cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } else { + d_vertices_view = cugraph_type_erased_device_array_view_create(NULL, 0, vertex_tid); + d_src_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + d_dst_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + } + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // Build an adjacency matrix of the input graph so the generated samples can be validated + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); +
cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_type_erased_device_array_free(d_dst_bias); + cugraph_coo_free(result); + cugraph_mg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_negative_sampling_uniform, handle); + result |= RUN_MG_TEST(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/c_api/negative_sampling_test.c b/cpp/tests/c_api/negative_sampling_test.c new file mode 100644 index 00000000000..5e8d3f7e765 --- /dev/null +++ b/cpp/tests/c_api/negative_sampling_test.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_sg_test_graph(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + if (num_biases > 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = 
cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // Build an adjacency matrix of the input graph so the generated samples can be validated + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); + cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_type_erased_device_array_free(d_dst_bias); + cugraph_coo_free(result); + cugraph_sg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2,
2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int main(int argc, char** argv) +{ + cugraph_resource_handle_t* handle = NULL; + + handle = cugraph_create_resource_handle(NULL); + + int result = 0; + result |= RUN_TEST_NEW(test_negative_sampling_uniform, handle); + result |= RUN_TEST_NEW(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + + return result; +} diff --git a/cpp/tests/sampling/detail/nbr_sampling_validate.cu b/cpp/tests/sampling/detail/nbr_sampling_validate.cu index 61731e2e15c..70828e559f1 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_validate.cu +++ b/cpp/tests/sampling/detail/nbr_sampling_validate.cu @@ -75,6 +75,8 @@ struct ArithmeticZipLess { } else { return thrust::get<1>(left) < thrust::get<1>(right); } + } else { + return false; } } }; diff --git a/cpp/tests/sampling/mg_negative_sampling.cpp b/cpp/tests/sampling/mg_negative_sampling.cpp new file mode 100644 index 00000000000..7c64bb7fbbb --- /dev/null +++ b/cpp/tests/sampling/mg_negative_sampling.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_MGNegative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_MGNegative_Sampling() : graph_(*handle_) {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + template + void load_graph(input_t const& param) + { + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + *handle_, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(*handle_, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(*handle_) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle_->get_stream()); + rmm::device_uvector dst_bias_v(0, handle_->get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(*handle_, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for 
consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(*handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + // TODO: Move this to validation_utilities... + auto h_vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + rmm::device_uvector d_vertex_partition_range_lasts( + h_vertex_partition_range_lasts.size(), handle_->get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.size(), + handle_->get_stream()); + + size_t error_count = cugraph::test::count_edges_on_wrong_int_gpu( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + raft::device_span{d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()}); + + ASSERT_EQ(error_count, 0) << "edges generated on the wrong GPU > 0"; + + if ((negative_sampling_usecase.remove_duplicates) && (src_out.size() > 0)) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle_->get_stream()); + rmm::device_uvector graph_dst(0, handle_->get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + *handle_, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + *handle_, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + size_t sz = cugraph::host_scalar_allreduce( + handle_->get_comms(), src_out.size(), raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(sz, num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases?
+ } + } + + public: + static std::unique_ptr handle_; + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +template +std::unique_ptr + Tests_MGNegative_Sampling::handle_ = nullptr; + +using Tests_MGNegative_Sampling_File_i64_i64_float = + Tests_MGNegative_Sampling; + +using Tests_MGNegative_Sampling_Rmat_i64_i64_float = + Tests_MGNegative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{ + static_cast(current_test->handle_->get_comms().get_rank())}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_MGNegative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_MGNegative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/negative_sampling.cpp b/cpp/tests/sampling/negative_sampling.cpp new file mode 100644 index 00000000000..ba929c63e9b --- /dev/null +++ b/cpp/tests/sampling/negative_sampling.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_Negative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_Negative_Sampling() : graph_(raft::handle_t{}) {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + template + void load_graph(input_t const& param) + { + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + handle, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(handle, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(handle) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle.get_stream()); + rmm::device_uvector dst_bias_v(0, handle.get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(handle, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + 
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + size_t error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{src_out.data(), src_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Source vertices out of range > 0"; + + error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{dst_out.data(), dst_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Dest vertices out of range > 0"; + + if (negative_sampling_usecase.remove_duplicates) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle.get_stream()); + rmm::device_uvector graph_dst(0, handle.get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + handle, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + handle, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + ASSERT_EQ(src_out.size(), num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases? 
+ } + } + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +using Tests_Negative_Sampling_File_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i64_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i64_i64_float = + Tests_Negative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{0}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i64_i64_float, + 
::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i32_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/validation_utilities.cu b/cpp/tests/utilities/validation_utilities.cu new file mode 100644 index 00000000000..3da998ad626 --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.cu @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/graph_partition_utils.cuh" +#include "utilities/validation_utilities.hpp" + +#include + +#include +#include +#include +#include + +namespace cugraph::test { + +template +size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view) +{ + return thrust::count_if( + handle.get_thrust_policy(), + vertices.begin(), + vertices.end(), + [vertex_partition = cugraph::vertex_partition_device_view_t{ + vertex_partition_view}] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.in_local_vertex_partition_range_nocheck(val)); + }); +} + +template +size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst) +{ + return thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(src.size()), + [src, dst] __device__(size_t index) { + return (src[index - 1] == src[index]) && (dst[index - 1] == dst[index]); + }); +} + +// FIXME: Resolve this with dataframe_buffer variations in thrust_wrappers.cu +template +void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts) +{ + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end())); +} + +template +size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2) +{ + // FIXME: Add support for wgts, edgeids and edge_types... + // Added to the API for future support. + + auto iter1 = thrust::make_zip_iterator(srcs1.begin(), dsts1.begin()); + auto iter2 = thrust::make_zip_iterator(srcs2.begin(), dsts2.begin()); + auto output_iter = thrust::make_discard_iterator(); + + return thrust::distance(output_iter, + thrust::set_intersection(handle.get_thrust_policy(), + iter1, + iter1 + srcs1.size(), + iter2, + iter2 + srcs2.size(), + output_iter)); +#if 0 + // OLD Approach + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(src_out.begin(), dst_out.begin()), + thrust::make_zip_iterator(src_out.end(), dst_out.end()), + cuda::proclaim_return_type( + [src = raft::device_span{graph_src.data(), graph_src.size()}, + dst = raft::device_span{graph_dst.data(), + graph_dst.size()}] __device__(auto tuple) { +#if 0 + // FIXME: This fails on rocky linux CUDA 11.8, works on CUDA 12 + return thrust::binary_search(thrust::seq, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + tuple) ? size_t{1} : size_t{0}; +#else + auto lb = thrust::distance( + src.begin(), + thrust::lower_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + auto ub = thrust::distance( + src.begin(), + thrust::upper_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + + if (src.data()[lb] == thrust::get<0>(tuple)) { + return thrust::binary_search( + thrust::seq, dst.begin() + lb, dst.begin() + ub, thrust::get<1>(tuple)) + ? 
size_t{1} + : size_t{0}; + } else { + return size_t{0}; + } +#endif + })); +#endif +} + +template +size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts) +{ + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end()), + [comm_rank = handle.get_comms().get_rank(), + gpu_id_key_func = cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t{ + vertex_partition_range_lasts, + handle.get_comms().get_size(), + handle.get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(), + handle.get_subcomm(cugraph::partition_manager::minor_comm_name()) + .get_size()}] __device__(auto e) { + return (gpu_id_key_func(thrust::get<0>(e), thrust::get<1>(e)) != comm_rank); + }); +} + +// TODO: Split SG from MG? +template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + 
raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +} // namespace cugraph::test diff --git a/cpp/tests/utilities/validation_utilities.hpp b/cpp/tests/utilities/validation_utilities.hpp new file mode 100644 index 00000000000..b94ceaf68be --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph::test { +template +size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition); + +template +size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template +void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); + +template +size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template +size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +} // namespace cugraph::test diff --git a/dependencies.yaml b/dependencies.yaml index 0d2d8c5325d..1fb47d9e368 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -20,6 +20,7 @@ files: - depends_on_dask_cudf - depends_on_pylibraft - depends_on_raft_dask + # Deprecate pylibcugraphops - depends_on_pylibcugraphops - depends_on_pylibwholegraph - depends_on_cupy @@ -44,6 +45,7 @@ files: - cuda_version - docs - py_version + # Deprecate pylibcugraphops - depends_on_pylibcugraphops test_cpp: output: none @@ -135,6 +137,7 @@ files: extras: table: project includes: + - cuda_wheels - depends_on_rmm - depends_on_pylibraft py_test_pylibcugraph: @@ -188,6 +191,7 @@ files: table: project includes: - python_run_cugraph_dgl + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_dgl: output: pyproject @@ -214,6 +218,7 @@ files: table: project includes: - python_run_cugraph_pyg + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_pyg: output: pyproject @@ -239,6 +244,7 @@ files: extras: table: 
    includes:
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
   py_test_cugraph_equivariant:
     output: pyproject
@@ -298,6 +304,7 @@ files:
     conda_dir: python/cugraph-dgl/conda
     includes:
       - checks
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
       - cugraph_dgl_dev
       - test_python_common
@@ -308,6 +315,7 @@ files:
     conda_dir: python/cugraph-pyg/conda
     includes:
       - checks
+      # Deprecate pylibcugraphops
       - depends_on_pylibcugraphops
       - cugraph_pyg_dev
       - test_python_common
@@ -376,6 +384,36 @@ dependencies:
            packages:
              - cudatoolkit
              - cuda-nvtx
+  cuda_wheels:
+    specific:
+      - output_types: pyproject
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              use_cuda_wheels: "true"
+            packages:
+              - nvidia-cublas-cu12
+              - nvidia-curand-cu12
+              - nvidia-cusolver-cu12
+              - nvidia-cusparse-cu12
+          # CUDA 11 does not provide wheels, so use the system libraries instead
+          - matrix:
+              cuda: "11.*"
+              use_cuda_wheels: "true"
+            packages:
+          # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
+          # (e.g. for DLFW and pip devcontainers)
+          - matrix:
+              use_cuda_wheels: "false"
+            packages:
+          # if no matching matrix selectors passed, list the unsuffixed packages
+          # (just as a source of documentation, as this populates pyproject.toml in source control)
+          - matrix:
+            packages:
+              - nvidia-cublas
+              - nvidia-curand
+              - nvidia-cusolver
+              - nvidia-cusparse
   common_build:
     common:
       - output_types: [conda, pyproject]
@@ -389,6 +427,7 @@ dependencies:
          - c-compiler
          - cxx-compiler
          - libcudf==24.10.*,>=0.0.0a0
+          # Deprecate libcugraphops
          - libcugraphops==24.10.*,>=0.0.0a0
          - libraft-headers==24.10.*,>=0.0.0a0
          - libraft==24.10.*,>=0.0.0a0
@@ -875,6 +914,7 @@ dependencies:
          - pylibcugraph-cu11==24.10.*,>=0.0.0a0
      - {matrix: null, packages: [*pylibcugraph_unsuffixed]}
 
+  # Deprecate pylibcugraphops
   depends_on_pylibcugraphops:
     common:
       - output_types: conda
diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt
index d96d28b6b59..ca38b5551c9 100644
--- a/python/cugraph/CMakeLists.txt
+++ b/python/cugraph/CMakeLists.txt
@@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de
        OFF
 )
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
 
 if(NOT USE_CUGRAPH_OPS)
   message(STATUS "Disabling libcugraph functions that reference cugraph-ops")
@@ -49,6 +50,8 @@ endif()
 include(rapids-cython-core)
 
 if(NOT cugraph_FOUND)
+  find_package(CUDAToolkit REQUIRED)
+
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
@@ -60,8 +63,26 @@ if(NOT cugraph_FOUND)
   set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
   set(ALLOW_CLONE_CUGRAPH_OPS ON)
 
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+    set(CUDA_STATIC_MATH_LIBRARIES OFF)
+  elseif(USE_CUDA_MATH_WHEELS)
+    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+  endif()
+
   add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
+  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+    set(rpaths
+      "$ORIGIN/../nvidia/cublas/lib"
+      "$ORIGIN/../nvidia/cufft/lib"
+      "$ORIGIN/../nvidia/curand/lib"
+      "$ORIGIN/../nvidia/cusolver/lib"
+      "$ORIGIN/../nvidia/cusparse/lib"
+      "$ORIGIN/../nvidia/nvjitlink/lib"
+    )
+    set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND)
+  endif()
+
   set(cython_lib_dir cugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
 endif()
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index d30b03cb68d..045628e9c0d 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de
        OFF
 )
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
 
 if(NOT USE_CUGRAPH_OPS)
   message(STATUS "Disabling libcugraph functions that reference cugraph-ops")
@@ -49,6 +50,8 @@ endif()
 include(rapids-cython-core)
 
 if (NOT cugraph_FOUND)
+  find_package(CUDAToolkit REQUIRED)
+
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
@@ -60,8 +63,25 @@ if (NOT cugraph_FOUND)
   set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
   set(ALLOW_CLONE_CUGRAPH_OPS ON)
 
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+    set(CUDA_STATIC_MATH_LIBRARIES OFF)
+  elseif(USE_CUDA_MATH_WHEELS)
+    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+  endif()
+
   add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
+  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+    set(rpaths
+      "$ORIGIN/../nvidia/cublas/lib"
+      "$ORIGIN/../nvidia/curand/lib"
+      "$ORIGIN/../nvidia/cusolver/lib"
+      "$ORIGIN/../nvidia/cusparse/lib"
+      "$ORIGIN/../nvidia/nvjitlink/lib"
+    )
+    set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND)
+  endif()
+
   set(cython_lib_dir pylibcugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
   install(TARGETS cugraph_c DESTINATION ${cython_lib_dir})
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd
new file mode 100644
index 00000000000..e466e6ee5a0
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd
@@ -0,0 +1,71 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+
+cdef extern from "cugraph_c/coo.h":
+    ctypedef struct cugraph_coo_t:
+        pass
+
+    ctypedef struct cugraph_coo_list_t:
+        pass
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_sources(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_destinations(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_weights(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_id(
+            cugraph_coo_t* coo
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_coo_get_edge_type(
+            cugraph_coo_t* coo
+        )
+
+    cdef size_t \
+        cugraph_coo_list_size(
+            const cugraph_coo_list_t* coo_list
+        )
+
+    cdef cugraph_coo_t* \
+        cugraph_coo_list_element(
+            cugraph_coo_list_t* coo_list,
+            size_t index)
+
+    cdef void \
+        cugraph_coo_free(
+            cugraph_coo_t* coo
+        )
+
+    cdef void \
+        cugraph_coo_list_free(
+            cugraph_coo_list_t* coo_list
+        )
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
index f6d62377443..cda47e55f77 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -31,62 +31,16 @@ from pylibcugraph._cugraph_c.random cimport (
     cugraph_rng_state_t,
 )
 
+from pylibcugraph._cugraph_c.coo cimport (
+    cugraph_coo_t,
+    cugraph_coo_list_t,
+)
+
 cdef extern from "cugraph_c/graph_generators.h":
     ctypedef enum cugraph_generator_distribution_t:
         POWER_LAW
         UNIFORM
 
-    ctypedef struct cugraph_coo_t:
-        pass
-
-    ctypedef struct cugraph_coo_list_t:
-        pass
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_sources(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_destinations(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_weights(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_id(
-            cugraph_coo_t* coo
-        )
-
-    cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_coo_get_edge_type(
-            cugraph_coo_t* coo
-        )
-
-    cdef size_t \
-        cugraph_coo_list_size(
-            const cugraph_coo_list_t* coo_list
-        )
-
-    cdef cugraph_coo_t* \
-        cugraph_coo_list_element(
-            cugraph_coo_list_t* coo_list,
-            size_t index)
-
-    cdef void \
-        cugraph_coo_free(
-            cugraph_coo_t* coo
-        )
-
-    cdef void \
-        cugraph_coo_list_free(
-            cugraph_coo_list_t* coo_list
-        )
-
     cdef cugraph_error_code_t \
         cugraph_generate_rmat_edgelist(
             const cugraph_resource_handle_t* handle,
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
index 0f852d9cecd..4a707db03c5 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
@@ -39,7 +39,9 @@ from pylibcugraph._cugraph_c.random cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_t,
 )
-
+from pylibcugraph._cugraph_c.coo cimport (
+    cugraph_coo_t,
+)
 from pylibcugraph._cugraph_c.properties cimport (
     cugraph_edge_property_view_t,
 )
@@ -103,3 +105,21 @@ cdef extern from "cugraph_c/sampling_algorithms.h":
         cugraph_type_erased_device_array_t** vertices,
         cugraph_error_t** error
     )
+
+    # negative sampling
+    cdef cugraph_error_code_t \
+        cugraph_negative_sampling(
+            const cugraph_resource_handle_t* handle,
+            cugraph_rng_state_t* rng_state,
+            cugraph_graph_t* graph,
+            size_t num_samples,
+            const cugraph_type_erased_device_array_view_t* vertices,
+            const cugraph_type_erased_device_array_view_t* src_bias,
+            const cugraph_type_erased_device_array_view_t* dst_bias,
+            bool_t remove_duplicates,
+            bool_t remove_false_negatives,
+            bool_t exact_number_of_samples,
+            bool_t do_expensive_check,
+            cugraph_coo_t **result,
+            cugraph_error_t **error
+        )
diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
index f38ad21d3b0..4ea96920e61 100644
--- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
+++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx
@@ -26,11 +26,7 @@ from pylibcugraph._cugraph_c.error cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_view_t,
 )
-from pylibcugraph._cugraph_c.graph_generators cimport (
-    cugraph_generate_rmat_edgelist,
-    cugraph_generate_edge_weights,
-    cugraph_generate_edge_ids,
-    cugraph_generate_edge_types,
+from pylibcugraph._cugraph_c.coo cimport (
     cugraph_coo_t,
     cugraph_coo_get_sources,
     cugraph_coo_get_destinations,
@@ -39,6 +35,12 @@ from pylibcugraph._cugraph_c.graph_generators cimport (
     cugraph_coo_get_edge_type,
     cugraph_coo_free,
 )
+from pylibcugraph._cugraph_c.graph_generators cimport (
+    cugraph_generate_rmat_edgelist,
+    cugraph_generate_edge_weights,
+    cugraph_generate_edge_ids,
+    cugraph_generate_edge_types,
+)
 from pylibcugraph.resource_handle cimport (
     ResourceHandle,
 )
diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
index 32af0c13fc0..7de48708f80 100644
--- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
+++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx
@@ -26,14 +26,9 @@ from pylibcugraph._cugraph_c.error cimport (
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_view_t,
 )
-from pylibcugraph._cugraph_c.graph_generators cimport (
-    cugraph_generate_rmat_edgelists,
-    cugraph_generate_edge_weights,
-    cugraph_generate_edge_ids,
-    cugraph_generate_edge_types,
+from pylibcugraph._cugraph_c.coo cimport (
     cugraph_coo_t,
     cugraph_coo_list_t,
-    cugraph_generator_distribution_t,
     cugraph_coo_get_sources,
     cugraph_coo_get_destinations,
     cugraph_coo_get_edge_weights,
@@ -44,6 +39,13 @@ from pylibcugraph._cugraph_c.graph_generators cimport (
     cugraph_coo_free,
     cugraph_coo_list_free,
 )
+from pylibcugraph._cugraph_c.graph_generators cimport (
+    cugraph_generate_rmat_edgelists,
+    cugraph_generate_edge_weights,
+    cugraph_generate_edge_ids,
+    cugraph_generate_edge_types,
+    cugraph_generator_distribution_t,
+)
 from pylibcugraph.resource_handle cimport (
     ResourceHandle,
 )
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 73699b6e445..727d05e8e52 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -23,6 +23,10 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
     "pylibraft==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -73,4 +77,4 @@ requires = [
     "pylibraft==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-matrix-entry = "cuda_suffixed=true"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
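
Reviewer note: as a quick orientation, the sketch below shows roughly how the new cugraph_negative_sampling entry point declared in sampling_algorithms.pxd above could be driven from C. It is illustrative only and is not part of this change: the handle/RNG/graph setup is omitted, passing NULL for the optional vertices/src_bias/dst_bias views is an assumption about the API, and the bool_t TRUE/FALSE values and CUGRAPH_SUCCESS check are taken from the existing cugraph C API conventions rather than from this diff.

/* Minimal sketch, assuming handle, rng_state, and graph were created elsewhere. */
#include <cugraph_c/sampling_algorithms.h>
#include <cugraph_c/coo.h>
#include <stdio.h>

int run_negative_sampling(const cugraph_resource_handle_t* handle,
                          cugraph_rng_state_t* rng_state,
                          cugraph_graph_t* graph,
                          size_t num_samples)
{
  cugraph_coo_t* result = NULL;
  cugraph_error_t* error = NULL;

  /* NULL vertices/src_bias/dst_bias => sample over the whole graph
     (assumption based on the view pointers in the declared signature). */
  cugraph_error_code_t status =
    cugraph_negative_sampling(handle,
                              rng_state,
                              graph,
                              num_samples,
                              NULL,  /* vertices */
                              NULL,  /* src_bias */
                              NULL,  /* dst_bias */
                              TRUE,  /* remove_duplicates */
                              TRUE,  /* remove_false_negatives */
                              FALSE, /* exact_number_of_samples */
                              FALSE, /* do_expensive_check */
                              &result,
                              &error);
  if (status != CUGRAPH_SUCCESS) {
    fprintf(stderr, "negative sampling failed\n");
    return 1;
  }

  /* The sampled edge list comes back as a COO; these getters are the ones
     declared in coo.pxd / cugraph_c/coo.h. */
  cugraph_type_erased_device_array_view_t* srcs = cugraph_coo_get_sources(result);
  cugraph_type_erased_device_array_view_t* dsts = cugraph_coo_get_destinations(result);
  (void)srcs;
  (void)dsts;

  cugraph_coo_free(result);
  return 0;
}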