diff --git a/cpp/src/sampling/detail/graph_functions.hpp b/cpp/src/sampling/detail/graph_functions.hpp
index 8eef9c83d61..71c1b27655a 100644
--- a/cpp/src/sampling/detail/graph_functions.hpp
+++ b/cpp/src/sampling/detail/graph_functions.hpp
@@ -150,7 +150,8 @@ gather_local_edges(
   const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
   rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
   typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets);
+  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
+  bool remove_invalid_vertices = true);
 
 /**
  * @brief Gather edge list for specified vertices
diff --git a/cpp/src/sampling/detail/sampling_utils_impl.cuh b/cpp/src/sampling/detail/sampling_utils_impl.cuh
index a3434530505..02513de6889 100644
--- a/cpp/src/sampling/detail/sampling_utils_impl.cuh
+++ b/cpp/src/sampling/detail/sampling_utils_impl.cuh
@@ -333,7 +333,8 @@ gather_local_edges(
   const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
   rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
   typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets)
+  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
+  bool remove_invalid_vertices)
 {
   using vertex_t  = typename GraphViewType::vertex_type;
   using edge_t    = typename GraphViewType::edge_type;
@@ -417,6 +418,7 @@ gather_local_edges(
           }
         } else {
           minors[index] = invalid_vertex_id;
+          if (weights != nullptr) { weights[index] = weight_t{0}; }
         }
       });
   } else {
@@ -485,52 +487,56 @@ gather_local_edges(
           edge_index_first[index] = g_dst_index;
         } else {
           minors[index] = invalid_vertex_id;
+          if (weights != nullptr) { weights[index] = weight_t{0}; }
         }
       });
   }
 
-  if (weights) {
-    auto input_iter = thrust::make_zip_iterator(
-      thrust::make_tuple(majors.begin(), minors.begin(), weights->begin()));
+  if (remove_invalid_vertices) {
+    if (weights) {
+      auto input_iter = thrust::make_zip_iterator(
+        thrust::make_tuple(majors.begin(), minors.begin(), weights->begin()));
 
-    CUGRAPH_EXPECTS(minors.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-                    "remove_if will fail, minors.size() is too large");
+      CUGRAPH_EXPECTS(minors.size() < std::numeric_limits<int32_t>::max(),
+                      "remove_if will fail, minors.size() is too large");
 
-    // FIXME: remove_if has a 32-bit overflow issue (https://github.com/NVIDIA/thrust/issues/1302)
-    // Seems unlikely here (the goal of sampling is to extract small graphs)
-    // so not going to work around this for now.
-    auto compacted_length = thrust::distance(
-      input_iter,
-      thrust::remove_if(
-        handle.get_thrust_policy(),
+      // FIXME: remove_if has a 32-bit overflow issue
+      // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of sampling
+      // is to extract small graphs) so not going to work around this for now.
+      auto compacted_length = thrust::distance(
         input_iter,
-        input_iter + minors.size(),
-        minors.begin(),
-        [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+        thrust::remove_if(
+          handle.get_thrust_policy(),
+          input_iter,
+          input_iter + minors.size(),
+          minors.begin(),
+          [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+
+      majors.resize(compacted_length, handle.get_stream());
+      minors.resize(compacted_length, handle.get_stream());
+      weights->resize(compacted_length, handle.get_stream());
+    } else {
+      auto input_iter =
+        thrust::make_zip_iterator(thrust::make_tuple(majors.begin(), minors.begin()));
 
-    majors.resize(compacted_length, handle.get_stream());
-    minors.resize(compacted_length, handle.get_stream());
-    weights->resize(compacted_length, handle.get_stream());
-  } else {
-    auto input_iter = thrust::make_zip_iterator(thrust::make_tuple(majors.begin(), minors.begin()));
-
-    CUGRAPH_EXPECTS(minors.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-                    "remove_if will fail, minors.size() is too large");
-
-    auto compacted_length = thrust::distance(
-      input_iter,
-      // FIXME: remove_if has a 32-bit overflow issue (https://github.com/NVIDIA/thrust/issues/1302)
-      // Seems unlikely here (the goal of sampling is to extract small graphs)
-      // so not going to work around this for now.
-      thrust::remove_if(
-        handle.get_thrust_policy(),
-        input_iter,
-        input_iter + minors.size(),
-        minors.begin(),
-        [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+      CUGRAPH_EXPECTS(minors.size() < std::numeric_limits<int32_t>::max(),
+                      "remove_if will fail, minors.size() is too large");
 
-    majors.resize(compacted_length, handle.get_stream());
-    minors.resize(compacted_length, handle.get_stream());
+      auto compacted_length = thrust::distance(
+        input_iter,
+        // FIXME: remove_if has a 32-bit overflow issue
+        // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
+        // sampling is to extract small graphs) so not going to work around this for now.
+        thrust::remove_if(
+          handle.get_thrust_policy(),
+          input_iter,
+          input_iter + minors.size(),
+          minors.begin(),
+          [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+
+      majors.resize(compacted_length, handle.get_stream());
+      minors.resize(compacted_length, handle.get_stream());
+    }
   }
 
   return std::make_tuple(std::move(majors), std::move(minors), std::move(weights));
diff --git a/cpp/src/sampling/detail/sampling_utils_mg.cu b/cpp/src/sampling/detail/sampling_utils_mg.cu
index 726309e5370..2ca52c9ffe4 100644
--- a/cpp/src/sampling/detail/sampling_utils_mg.cu
+++ b/cpp/src/sampling/detail/sampling_utils_mg.cu
@@ -150,7 +150,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -160,7 +161,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -170,7 +172,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -180,7 +183,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -190,7 +194,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -200,7 +205,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
diff --git a/cpp/src/sampling/detail/sampling_utils_sg.cu b/cpp/src/sampling/detail/sampling_utils_sg.cu
index ae2980e5f10..63c10c2377c 100644
--- a/cpp/src/sampling/detail/sampling_utils_sg.cu
+++ b/cpp/src/sampling/detail/sampling_utils_sg.cu
@@ -93,7 +93,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -103,7 +104,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -113,7 +115,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -123,7 +126,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -133,7 +137,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -143,7 +148,8 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets,
+                   bool remove_invalid_vertices);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh
new file mode 100644
index 00000000000..0f0075020af
--- /dev/null
+++ b/cpp/src/sampling/random_walks_impl.cuh
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <detail/graph_utils.cuh>
+#include <sampling/detail/graph_functions.hpp>
+#include <sampling/detail/sampling_utils_impl.cuh>
+
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/partition_manager.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/shuffle_comm.cuh>
+
+#include <raft/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/optional.h>
+
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+#include <vector>
+
+namespace cugraph {
+namespace detail {
+
+inline uint64_t get_current_time_nanoseconds()
+{
+  timespec current_time;
+  clock_gettime(CLOCK_REALTIME, &current_time);
+  return current_time.tv_sec * 1000000000 + current_time.tv_nsec;
+}
+
+// FIXME: With impending new selection primitive, this might not be the appropriate
+// structure for the code.
+struct uniform_selector {
+  raft::random::RngState rng_state_;
+
+  uniform_selector(uint64_t seed) : rng_state_(seed) {}
+
+  template <typename vertex_t, typename edge_t>
+  rmm::device_uvector<edge_t> get_random_indices(raft::handle_t const& handle,
+                                                 rmm::device_uvector<vertex_t>& current_vertices,
+                                                 rmm::device_uvector<edge_t>& out_degrees)
+  {
+    rmm::device_uvector<edge_t> reply(out_degrees.size(), handle.get_stream());
+    rmm::device_uvector<double> random(out_degrees.size(), handle.get_stream());
+
+    // pick a uniform random integer between 0 and out_degrees[i] - 1
+    raft::random::uniform<double, size_t>(
+      rng_state_, random.data(), random.size(), double{0}, double{1}, handle.get_stream());
+
+    thrust::transform(handle.get_thrust_policy(),
+                      thrust::make_zip_iterator(random.begin(), out_degrees.begin()),
+                      thrust::make_zip_iterator(random.end(), out_degrees.end()),
+                      reply.begin(),
+                      [] __device__(auto t) {
+                        double rnd        = thrust::get<0>(t);
+                        edge_t out_degree = thrust::get<1>(t);
+
+                        return (out_degree > 0) ? static_cast<edge_t>(rnd * out_degree)
+                                                : edge_t{-1};
+                      });
+
+    return reply;
+  }
+};
+
+struct biased_selector {
+  uint64_t seed_{0};
+
+  template <typename vertex_t, typename edge_t>
+  rmm::device_uvector<edge_t> get_random_indices(raft::handle_t const& handle,
+                                                 rmm::device_uvector<vertex_t>& current_vertices,
+                                                 rmm::device_uvector<edge_t>& out_degrees)
+  {
+    //  To do biased sampling, I need out_weights instead of out_degrees.
+    //  Then I generate a random float between [0, out_weights[v]).  Then
+    //  instead of making a decision based on the index I need to find
+    //  upper_bound (or is it lower_bound) of the random number and
+    //  the cumulative weight.
+    CUGRAPH_FAIL("biased sampling not implemented");
+  }
+};
+
+template <typename weight_t>
+struct node2vec_selector {
+  weight_t p_;
+  weight_t q_;
+  uint64_t seed_{0};
+
+  template <typename vertex_t, typename edge_t>
+  rmm::device_uvector<edge_t> get_random_indices(raft::handle_t const& handle,
+                                                 rmm::device_uvector<vertex_t>& current_vertices,
+                                                 rmm::device_uvector<edge_t>& out_degrees)
+  {
+    //  To do node2vec, I need the following:
+    //    1) transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v to compute the sum of the
+    //       node2vec style weights
+    //    2) Generate a random number between [0, output_from_trdnioeebv[v])
+    //    3) a sampling value that lets me pick the correct edge based on the same computation
+    //       (essentially weighted sampling, but with a function that computes the weight rather
+    //       than just using the edge weights)
+    CUGRAPH_FAIL("node2vec not implemented");
+  }
+};
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool multi_gpu,
+          typename random_selector_t>
+std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+random_walk_impl(raft::handle_t const& handle,
+                 graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+                 raft::device_span<vertex_t const> start_vertices,
+                 size_t max_length,
+                 random_selector_t random_selector)
+{
+  // FIXME: This should be the global constant
+  vertex_t invalid_vertex_id = graph_view.number_of_vertices();
+
+  // preamble step for out-degree info:
+  //
+  auto&& [global_degree_offsets, global_out_degrees] =
+    detail::get_global_degree_information(handle, graph_view);
+
+  rmm::device_uvector<vertex_t> result_vertices(start_vertices.size() * (max_length + 1),
+                                                handle.get_stream());
+  auto result_weights = graph_view.is_weighted()
+                          ? std::make_optional<rmm::device_uvector<weight_t>>(
+                              start_vertices.size() * max_length, handle.get_stream())
+                          : std::nullopt;
+
+  thrust::fill(
+    handle.get_thrust_policy(), result_vertices.begin(), result_vertices.end(), invalid_vertex_id);
+  if (result_weights)
+    thrust::fill(
+      handle.get_thrust_policy(), result_weights->begin(), result_weights->end(), weight_t{0});
+
+  rmm::device_uvector<vertex_t> current_vertices(start_vertices.size(), handle.get_stream());
+  rmm::device_uvector<size_t> current_position(0, handle.get_stream());
+  rmm::device_uvector<int> current_gpu(0, handle.get_stream());
+  auto new_weights = graph_view.is_weighted()
+                       ? std::make_optional<rmm::device_uvector<weight_t>>(0, handle.get_stream())
+                       : std::nullopt;
+
+  if constexpr (multi_gpu) {
+    current_position.resize(start_vertices.size(), handle.get_stream());
+    current_gpu.resize(start_vertices.size(), handle.get_stream());
+    auto current_iter = thrust::make_zip_iterator(
+      current_vertices.begin(), current_gpu.begin(), current_position.begin());
+
+    thrust::tabulate(handle.get_thrust_policy(),
+                     current_iter,
+                     current_iter + current_vertices.size(),
+                     [my_gpu_id      = handle.get_comms().get_rank(),
+                      start_vertices = start_vertices.begin()] __device__(auto i) {
+                       return thrust::make_tuple(
+                         start_vertices[i], my_gpu_id, static_cast<size_t>(i));
+                     });
+  } else {
+    raft::copy(
+      current_vertices.begin(), start_vertices.begin(), start_vertices.size(), handle.get_stream());
+  }
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<size_t>(0),
+    thrust::make_counting_iterator<size_t>(current_vertices.size()),
+    [current_verts = current_vertices.data(),
+     result_verts  = result_vertices.data(),
+     max_length] __device__(size_t i) { result_verts[i * (max_length + 1)] = current_verts[i]; });
+
+  rmm::device_uvector<vertex_t> vertex_partition_range_lasts(
+    graph_view.vertex_partition_range_lasts().size(), handle.get_stream());
+  raft::update_device(vertex_partition_range_lasts.data(),
+                      graph_view.vertex_partition_range_lasts().data(),
+                      graph_view.vertex_partition_range_lasts().size(),
+                      handle.get_stream());
+
+  for (size_t level = 0; level < max_length; ++level) {
+    rmm::device_uvector<edge_t> random_indices(0, handle.get_stream());
+
+    if constexpr (multi_gpu) {
+      // Shuffle vertices to correct GPU to compute random indices
+      std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position),
+                            std::ignore) =
+        cugraph::groupby_gpu_id_and_shuffle_values(
+          handle.get_comms(),
+          thrust::make_zip_iterator(
+            current_vertices.begin(), current_gpu.begin(), current_position.begin()),
+          thrust::make_zip_iterator(
+            current_vertices.end(), current_gpu.end(), current_position.end()),
+          [key_func =
+             cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+               {vertex_partition_range_lasts.begin(),
+                vertex_partition_range_lasts.size()}}] __device__(auto val) {
+            return key_func(thrust::get<0>(val));
+          },
+          handle.get_stream());
+
+      auto&& out_degrees =
+        get_active_major_global_degrees(handle, graph_view, current_vertices, global_out_degrees);
+
+      random_indices = random_selector.get_random_indices(handle, current_vertices, out_degrees);
+
+      //  *** NOTE: to support node2vec, the current tuples also should specify a previous src
+      //    (invalid_vertex for initialization).  Note, computing node2vec will be more
+      //    complicated/expensive
+      // Shuffle current_vertices tuples
+      std::forward_as_tuple(
+        std::tie(current_vertices, current_gpu, current_position, random_indices), std::ignore) =
+        cugraph::groupby_gpu_id_and_shuffle_values(
+          handle.get_comms(),
+          thrust::make_zip_iterator(current_vertices.begin(),
+                                    current_gpu.begin(),
+                                    current_position.begin(),
+                                    random_indices.begin()),
+          thrust::make_zip_iterator(current_vertices.end(),
+                                    current_gpu.end(),
+                                    current_position.end(),
+                                    random_indices.end()),
+          [key_func =
+             cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+               {vertex_partition_range_lasts.begin(),
+                vertex_partition_range_lasts.size()}}] __device__(auto val) {
+            return key_func(thrust::get<0>(val));
+          },
+          handle.get_stream());
+
+      // Need to allgather across the col communicator
+      auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+      size_t source_count  = current_vertices.size();
+
+      auto external_source_counts =
+        cugraph::host_scalar_allgather(col_comm, source_count, handle.get_stream());
+
+      auto total_external_source_count =
+        std::accumulate(external_source_counts.begin(), external_source_counts.end(), size_t{0});
+
+      std::vector<size_t> displacements(external_source_counts.size(), size_t{0});
+      std::exclusive_scan(external_source_counts.begin(),
+                          external_source_counts.end(),
+                          displacements.begin(),
+                          size_t{0});
+
+      rmm::device_uvector<vertex_t> active_vertices(total_external_source_count,
+                                                    handle.get_stream());
+      rmm::device_uvector<int> active_gpu(total_external_source_count, handle.get_stream());
+      rmm::device_uvector<size_t> active_position(total_external_source_count, handle.get_stream());
+      rmm::device_uvector<edge_t> active_random_indices(total_external_source_count,
+                                                        handle.get_stream());
+
+      // Get the sources other gpus on the same row are working on
+      // FIXME : replace with device_bcast for better scaling
+      device_allgatherv(col_comm,
+                        current_vertices.data(),
+                        active_vertices.data(),
+                        external_source_counts,
+                        displacements,
+                        handle.get_stream());
+      device_allgatherv(col_comm,
+                        current_gpu.data(),
+                        active_gpu.data(),
+                        external_source_counts,
+                        displacements,
+                        handle.get_stream());
+      device_allgatherv(col_comm,
+                        current_position.data(),
+                        active_position.data(),
+                        external_source_counts,
+                        displacements,
+                        handle.get_stream());
+      device_allgatherv(col_comm,
+                        random_indices.data(),
+                        active_random_indices.data(),
+                        external_source_counts,
+                        displacements,
+                        handle.get_stream());
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(active_vertices.begin(),
+                                             active_gpu.begin(),
+                                             active_position.begin(),
+                                             active_random_indices.begin()),
+                   thrust::make_zip_iterator(active_vertices.end(),
+                                             active_gpu.end(),
+                                             active_position.end(),
+                                             active_random_indices.begin()));
+
+      current_vertices = std::move(active_vertices);
+      current_gpu      = std::move(active_gpu);
+      current_position = std::move(active_position);
+      random_indices   = std::move(active_random_indices);
+    } else {
+      auto&& out_degrees =
+        get_active_major_global_degrees(handle, graph_view, current_vertices, global_out_degrees);
+
+      random_indices = random_selector.get_random_indices(handle, current_vertices, out_degrees);
+    }
+
+    std::tie(std::ignore, current_vertices, new_weights) =
+      detail::gather_local_edges(handle,
+                                 graph_view,
+                                 current_vertices,
+                                 std::move(random_indices),
+                                 edge_t{1},
+                                 global_degree_offsets,
+                                 false);
+
+    if constexpr (multi_gpu) {
+      //
+      //  Now I can iterate over the tuples (current_vertices, new_weights, current_gpu,
+      //  current_position) and skip over anything where current_vertices == invalid_vertex_id.
+      //  There should, for any vertex, be at most one gpu where the vertex has a new vertex
+      //  neighbor.
+      //
+      if (new_weights) {
+        auto input_iter = thrust::make_zip_iterator(current_vertices.begin(),
+                                                    new_weights->begin(),
+                                                    current_gpu.begin(),
+                                                    current_position.begin());
+
+        CUGRAPH_EXPECTS(current_vertices.size() < std::numeric_limits<int32_t>::max(),
+                        "remove_if will fail, current_vertices.size() is too large");
+
+        // FIXME: remove_if has a 32-bit overflow issue
+        // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
+        // sampling is to extract small graphs) so not going to work around this for now.
+        auto compacted_length = thrust::distance(
+          input_iter,
+          thrust::remove_if(
+            handle.get_thrust_policy(),
+            input_iter,
+            input_iter + current_vertices.size(),
+            current_vertices.begin(),
+            [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+
+        current_vertices.resize(compacted_length, handle.get_stream());
+        new_weights->resize(compacted_length, handle.get_stream());
+        current_gpu.resize(compacted_length, handle.get_stream());
+        current_position.resize(compacted_length, handle.get_stream());
+
+        // Shuffle back to original GPU
+        auto current_iter = thrust::make_zip_iterator(current_vertices.begin(),
+                                                      new_weights->begin(),
+                                                      current_gpu.begin(),
+                                                      current_position.begin());
+
+        std::forward_as_tuple(
+          std::tie(current_vertices, *new_weights, current_gpu, current_position), std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<2>(val); },
+            handle.get_stream());
+
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator<size_t>(0),
+                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
+                         [current_verts = current_vertices.data(),
+                          new_wgts      = new_weights->data(),
+                          current_pos   = current_position.begin(),
+                          result_verts  = result_vertices.data(),
+                          result_wgts   = result_weights->data(),
+                          level,
+                          max_length] __device__(size_t i) {
+                           result_verts[current_pos[i] * (max_length + 1) + level + 1] =
+                             current_verts[i];
+                           result_wgts[current_pos[i] * max_length + level] = new_wgts[i];
+                         });
+      } else {
+        auto input_iter = thrust::make_zip_iterator(
+          current_vertices.begin(), current_gpu.begin(), current_position.begin());
+
+        CUGRAPH_EXPECTS(current_vertices.size() < std::numeric_limits<int32_t>::max(),
+                        "remove_if will fail, current_vertices.size() is too large");
+
+        auto compacted_length = thrust::distance(
+          input_iter,
+          // FIXME: remove_if has a 32-bit overflow issue
+          // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
+          // sampling is to extract small graphs) so not going to work around this for now.
+          thrust::remove_if(
+            handle.get_thrust_policy(),
+            input_iter,
+            input_iter + current_vertices.size(),
+            current_vertices.begin(),
+            [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
+
+        current_vertices.resize(compacted_length, handle.get_stream());
+        current_gpu.resize(compacted_length, handle.get_stream());
+        current_position.resize(compacted_length, handle.get_stream());
+
+        // Shuffle back to original GPU
+        auto current_iter = thrust::make_zip_iterator(
+          current_vertices.begin(), current_gpu.begin(), current_position.begin());
+
+        std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position),
+                              std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<1>(val); },
+            handle.get_stream());
+
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator<size_t>(0),
+                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
+                         [current_verts = current_vertices.data(),
+                          current_pos   = current_position.data(),
+                          result_verts  = result_vertices.data(),
+                          level,
+                          max_length] __device__(size_t i) {
+                           result_verts[current_pos[i] * (max_length + 1) + level + 1] =
+                             current_verts[i];
+                         });
+      }
+    } else {
+      if (new_weights) {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator<size_t>(0),
+                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
+                         [current_verts = current_vertices.data(),
+                          new_wgts      = new_weights->data(),
+                          result_verts  = result_vertices.data(),
+                          result_wgts   = result_weights->data(),
+                          level,
+                          max_length] __device__(size_t i) {
+                           result_verts[i * (max_length + 1) + level + 1] = current_verts[i];
+                           result_wgts[i * max_length + level]            = new_wgts[i];
+                         });
+      } else {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator<size_t>(0),
+                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
+                         [current_verts = current_vertices.data(),
+                          result_verts  = result_vertices.data(),
+                          level,
+                          max_length] __device__(size_t i) {
+                           result_verts[i * (max_length + 1) + level + 1] = current_verts[i];
+                         });
+      }
+    }
+  }
+
+  return std::make_tuple(std::move(result_vertices), std::move(result_weights));
+}
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+uniform_random_walks(raft::handle_t const& handle,
+                     graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+                     raft::device_span<vertex_t const> start_vertices,
+                     size_t max_length,
+                     uint64_t seed)
+{
+  return detail::random_walk_impl(
+    handle,
+    graph_view,
+    start_vertices,
+    max_length,
+    detail::uniform_selector((seed == 0 ? detail::get_current_time_nanoseconds() : seed)));
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+biased_random_walks(raft::handle_t const& handle,
+                    graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+                    raft::device_span<vertex_t const> start_vertices,
+                    size_t max_length,
+                    uint64_t seed)
+{
+  return detail::random_walk_impl(
+    handle,
+    graph_view,
+    start_vertices,
+    max_length,
+    detail::biased_selector{(seed == 0 ? detail::get_current_time_nanoseconds() : seed)});
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+node2vec_random_walks(raft::handle_t const& handle,
+                      graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+                      raft::device_span<vertex_t const> start_vertices,
+                      size_t max_length,
+                      weight_t p,
+                      weight_t q,
+                      uint64_t seed)
+{
+  return detail::random_walk_impl(
+    handle,
+    graph_view,
+    start_vertices,
+    max_length,
+    detail::node2vec_selector<weight_t>{
+      p, q, (seed == 0 ? detail::get_current_time_nanoseconds() : seed)});
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks_mg.cu b/cpp/src/sampling/random_walks_mg.cu
index ff4a07cb93c..c7c71d11be4 100644
--- a/cpp/src/sampling/random_walks_mg.cu
+++ b/cpp/src/sampling/random_walks_mg.cu
@@ -16,43 +16,9 @@
 
 #include <cugraph/algorithms.hpp>
 
-namespace cugraph {
-
-// FIXME: Temporarily here until random_walks_impl.cuh is ready with the real implementation
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-uniform_random_walks(raft::handle_t const& handle,
-                     graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                     raft::device_span<vertex_t const> start_vertices,
-                     size_t max_length,
-                     uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
-
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-biased_random_walks(raft::handle_t const& handle,
-                    graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                    raft::device_span<vertex_t const> start_vertices,
-                    size_t max_length,
-                    uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
+#include <sampling/random_walks_impl.cuh>
 
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-node2vec_random_walks(raft::handle_t const& handle,
-                      graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                      raft::device_span<vertex_t const> start_vertices,
-                      size_t max_length,
-                      weight_t p,
-                      weight_t q,
-                      uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
+namespace cugraph {
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<float>>>
 uniform_random_walks(raft::handle_t const& handle,
diff --git a/cpp/src/sampling/random_walks_sg.cu b/cpp/src/sampling/random_walks_sg.cu
index e7634795eb6..fde8baccb1f 100644
--- a/cpp/src/sampling/random_walks_sg.cu
+++ b/cpp/src/sampling/random_walks_sg.cu
@@ -16,43 +16,9 @@
 
 #include <cugraph/algorithms.hpp>
 
-namespace cugraph {
-
-// FIXME: Temporarily here until random_walks_impl.cuh is ready with the real implementation
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-uniform_random_walks(raft::handle_t const& handle,
-                     graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                     raft::device_span<vertex_t const> start_vertices,
-                     size_t max_length,
-                     uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
-
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-biased_random_walks(raft::handle_t const& handle,
-                    graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                    raft::device_span<vertex_t const> start_vertices,
-                    size_t max_length,
-                    uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
+#include <sampling/random_walks_impl.cuh>
 
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-node2vec_random_walks(raft::handle_t const& handle,
-                      graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-                      raft::device_span<vertex_t const> start_vertices,
-                      size_t max_length,
-                      weight_t p,
-                      weight_t q,
-                      uint64_t seed)
-{
-  CUGRAPH_FAIL("Not Implemented");
-}
+namespace cugraph {
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<float>>>
 uniform_random_walks(raft::handle_t const& handle,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 1fa7532cb4e..4179ee026e0 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(cugraphtestutil STATIC
             utilities/thrust_wrapper.cu
             utilities/misc_utilities.cpp
             components/wcc_graphs.cu
+            sampling/random_walks_check_sg.cu
             ../../thirdparty/mmio/mmio.c)
 
 target_compile_options(cugraphtestutil
@@ -340,7 +341,7 @@ ConfigureTest(LEGACY_RANDOM_WALKS_TEST sampling/random_walks_test.cu)
 ###################################################################################################
 # - RANDOM_WALKS tests ----------------------------------------------------------------------------
 #  FIXME: Rename to random_walks_test.cu once the legacy implementation is deleted
-ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cu)
+ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 
 ###################################################################################################
 ConfigureTest(RANDOM_WALKS_LOW_LEVEL_TEST sampling/rw_low_level_test.cu)
@@ -385,10 +386,17 @@ if(BUILD_CUGRAPH_MG_TESTS)
 
     add_library(cugraphmgtestutil STATIC
                 utilities/device_comm_wrapper.cu
-                utilities/mg_utilities.cpp)
+                utilities/mg_utilities.cpp
+                sampling/random_walks_check_mg.cu
+                )
 
     set_property(TARGET cugraphmgtestutil PROPERTY POSITION_INDEPENDENT_CODE ON)
 
+    target_compile_options(cugraphmgtestutil
+                PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUGRAPH_CXX_FLAGS}>"
+                       "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUGRAPH_CUDA_FLAGS}>>"
+    )
+
     target_include_directories(cugraphmgtestutil
         PRIVATE
         "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio"
@@ -543,7 +551,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
 
     ###########################################################################################
     # - RANDOM_WALKS tests --------------------------------------------------------------------
-    ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cu)
+    ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp)
 
     ###########################################################################################
     # - MG C API tests ------------------------------------------------------------------------
diff --git a/cpp/tests/sampling/detail/mg_gather_utils.cu b/cpp/tests/sampling/detail/mg_gather_utils.cu
index c38f333b35f..3d4229db838 100644
--- a/cpp/tests/sampling/detail/mg_gather_utils.cu
+++ b/cpp/tests/sampling/detail/mg_gather_utils.cu
@@ -17,6 +17,7 @@
 #include "nbr_sampling_utils.cuh"
 
 #include <sampling/detail/graph_functions.hpp>
+#include <sampling/detail/sampling_utils_impl.cuh>
 
 #include <utilities/mg_utilities.hpp>
 
@@ -180,7 +181,7 @@ class Tests_MG_GatherEdges
     // 2. Gather mnmg call
     // Generate random vertex ids in the range of current gpu
 
-    auto [global_degree_offsets, global_out_degrees] =
+    auto&& [global_degree_offsets, global_out_degrees] =
       cugraph::detail::get_global_degree_information(*handle_, mg_graph_view);
 
     // Generate random sources to gather on
diff --git a/cpp/tests/sampling/mg_random_walks_test.cpp b/cpp/tests/sampling/mg_random_walks_test.cpp
new file mode 100644
index 00000000000..7818c8023cb
--- /dev/null
+++ b/cpp/tests/sampling/mg_random_walks_test.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sampling/random_walks_check.hpp>
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/high_res_clock.h>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/partition_manager.hpp>
+
+#include <raft/comms/comms.hpp>
+#include <raft/comms/mpi_comms.hpp>
+
+#include <gtest/gtest.h>
+
+struct UniformRandomWalks_Usecase {
+  bool test_weighted{false};
+  uint64_t seed{0};
+  bool check_correctness{false};
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+  operator()(raft::handle_t const& handle,
+             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+             raft::device_span<vertex_t const> start_vertices,
+             size_t max_depth)
+  {
+    return cugraph::uniform_random_walks(handle, graph_view, start_vertices, max_depth, seed);
+  }
+
+  bool expect_throw() { return false; }
+};
+
+struct BiasedRandomWalks_Usecase {
+  bool test_weighted{true};
+  uint64_t seed{0};
+  bool check_correctness{false};
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+  operator()(raft::handle_t const& handle,
+             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+             raft::device_span<vertex_t const> start_vertices,
+             size_t max_depth)
+  {
+    return cugraph::biased_random_walks(handle, graph_view, start_vertices, max_depth, seed);
+  }
+
+  // FIXME: Not currently implemented
+  bool expect_throw() { return true; }
+};
+
+struct Node2VecRandomWalks_Usecase {
+  double p{1};
+  double q{1};
+  bool test_weighted{false};
+  uint64_t seed{0};
+  bool check_correctness{false};
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
+  operator()(raft::handle_t const& handle,
+             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
+             raft::device_span<vertex_t const> start_vertices,
+             size_t max_depth)
+  {
+    return cugraph::node2vec_random_walks(handle,
+                                          graph_view,
+                                          start_vertices,
+                                          max_depth,
+                                          static_cast<weight_t>(p),
+                                          static_cast<weight_t>(q),
+                                          seed);
+  }
+
+  // FIXME: Not currently implemented
+  bool expect_throw() { return true; }
+};
+
+template <typename tuple_t>
+class Tests_MGRandomWalks : public ::testing::TestWithParam<tuple_t> {
+ public:
+  Tests_MGRandomWalks() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(tuple_t const& param)
+  {
+    HighResClock hr_clock{};
+
+    auto [randomwalks_usecase, input_usecase] = param;
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+
+    bool renumber{true};
+    auto [graph, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, randomwalks_usecase.test_weighted, renumber);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto graph_view   = graph.view();
+    edge_t num_paths  = 10;
+    edge_t max_length = 10;
+    rmm::device_uvector<vertex_t> d_start(0, handle_->get_stream());
+
+    if (graph_view.local_vertex_partition_range_size() > 0) {
+      d_start.resize(std::min(10, graph_view.local_vertex_partition_range_size()),
+                     handle_->get_stream());
+
+      cugraph::detail::sequence_fill(handle_->get_stream(),
+                                     d_start.begin(),
+                                     d_start.size(),
+                                     graph_view.local_vertex_partition_range_first());
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+
+    if (randomwalks_usecase.expect_throw()) {
+      // biased and node2vec currently throw since they are not implemented
+      EXPECT_THROW(
+        randomwalks_usecase(*handle_,
+                            graph_view,
+                            raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
+                            max_length),
+        std::exception);
+    } else {
+      auto [d_vertices, d_weights] =
+        randomwalks_usecase(*handle_,
+                            graph_view,
+                            raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
+                            max_length);
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        double elapsed_time{0.0};
+        hr_clock.stop(&elapsed_time);
+        std::cout << "RandomWalks took " << elapsed_time * 1e-6 << " s.\n";
+      }
+
+      if (randomwalks_usecase.check_correctness) {
+        cugraph::test::random_walks_validate(*handle_,
+                                             graph_view,
+                                             std::move(d_start),
+                                             std::move(d_vertices),
+                                             std::move(d_weights),
+                                             max_length);
+      }
+    }
+  }
+
+  int get_rank() { return handle_->get_comms().get_rank(); }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename tuple_t>
+std::unique_ptr<raft::handle_t> Tests_MGRandomWalks<tuple_t>::handle_ = nullptr;
+
+using Tests_UniformRandomWalks_File =
+  Tests_MGRandomWalks<std::tuple<UniformRandomWalks_Usecase, cugraph::test::File_Usecase>>;
+using Tests_UniformRandomWalks_Rmat =
+  Tests_MGRandomWalks<std::tuple<UniformRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
+using Tests_BiasedRandomWalks_File =
+  Tests_MGRandomWalks<std::tuple<BiasedRandomWalks_Usecase, cugraph::test::File_Usecase>>;
+using Tests_BiasedRandomWalks_Rmat =
+  Tests_MGRandomWalks<std::tuple<BiasedRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
+using Tests_Node2VecRandomWalks_File =
+  Tests_MGRandomWalks<std::tuple<Node2VecRandomWalks_Usecase, cugraph::test::File_Usecase>>;
+using Tests_Node2VecRandomWalks_Rmat =
+  Tests_MGRandomWalks<std::tuple<Node2VecRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
+
+TEST_P(Tests_UniformRandomWalks_File, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_File_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+TEST_P(Tests_UniformRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+TEST_P(Tests_BiasedRandomWalks_File, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_File_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+TEST_P(Tests_BiasedRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+TEST_P(Tests_Node2VecRandomWalks_File, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_File_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+TEST_P(Tests_Node2VecRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  try {
+    run_current_test<int32_t, int32_t, float>(
+      override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+  } catch (const std::exception& e) {
+    std::cerr << "exception in rank = " << get_rank() << std::endl;
+    throw e;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_UniformRandomWalks_File,
+  ::testing::Combine(
+    ::testing::Values(UniformRandomWalks_Usecase{false, 0, true},
+                      UniformRandomWalks_Usecase{true, 0, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_BiasedRandomWalks_File,
+  ::testing::Combine(
+    ::testing::Values(BiasedRandomWalks_Usecase{false, 0, true},
+                      BiasedRandomWalks_Usecase{true, 0, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_Node2VecRandomWalks_File,
+  ::testing::Combine(
+    ::testing::Values(Node2VecRandomWalks_Usecase{4, 8, false, 0, true},
+                      Node2VecRandomWalks_Usecase{4, 8, true, 0, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_UniformRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{false, 0, true},
+                                       UniformRandomWalks_Usecase{true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_UniformRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_BiasedRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(BiasedRandomWalks_Usecase{false, 0, true},
+                                       BiasedRandomWalks_Usecase{true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_BiasedRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(BiasedRandomWalks_Usecase{true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Node2VecRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(Node2VecRandomWalks_Usecase{8, 4, false, 0, true},
+                                       Node2VecRandomWalks_Usecase{8, 4, true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_Node2VecRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(Node2VecRandomWalks_Usecase{8, 4, true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_random_walks_test.cu b/cpp/tests/sampling/mg_random_walks_test.cu
deleted file mode 100644
index b909f6fb6bc..00000000000
--- a/cpp/tests/sampling/mg_random_walks_test.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <utilities/base_fixture.hpp>
-#include <utilities/high_res_clock.h>
-#include <utilities/mg_utilities.hpp>
-#include <utilities/test_graphs.hpp>
-#include <utilities/test_utilities.hpp>
-#include <utilities/thrust_wrapper.hpp>
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/graph.hpp>
-#include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_view.hpp>
-#include <cugraph/partition_manager.hpp>
-
-#include <raft/comms/comms.hpp>
-#include <raft/comms/mpi_comms.hpp>
-
-#include <gtest/gtest.h>
-
-struct UniformRandomWalks_Usecase {
-  bool test_weighted{false};
-  uint64_t seed{0};
-  bool check_correctness{false};
-
-  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-  operator()(raft::handle_t const& handle,
-             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-             raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
-  {
-    return cugraph::uniform_random_walks(handle, graph_view, start_vertices, max_depth, seed);
-  }
-};
-
-struct BiasedRandomWalks_Usecase {
-  bool test_weighted{true};
-  uint64_t seed{0};
-  bool check_correctness{false};
-
-  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-  operator()(raft::handle_t const& handle,
-             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-             raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
-  {
-    return cugraph::biased_random_walks(handle, graph_view, start_vertices, max_depth, seed);
-  }
-};
-
-struct Node2VecRandomWalks_Usecase {
-  double p{1};
-  double q{1};
-  bool test_weighted{false};
-  uint64_t seed{0};
-  bool check_correctness{false};
-
-  template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-  std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
-  operator()(raft::handle_t const& handle,
-             cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
-             raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
-  {
-    return cugraph::node2vec_random_walks(
-      handle, graph_view, start_vertices, max_depth, p, q, seed);
-  }
-};
-
-template <typename tuple_t>
-class Tests_MGRandomWalks : public ::testing::TestWithParam<tuple_t> {
- public:
-  Tests_MGRandomWalks() {}
-
-  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
-
-  static void TearDownTestCase() { handle_.reset(); }
-
-  virtual void SetUp() {}
-  virtual void TearDown() {}
-
-  template <typename vertex_t, typename edge_t, typename weight_t>
-  void run_current_test(tuple_t const& param)
-  {
-    HighResClock hr_clock{};
-
-    auto [randomwalks_usecase, input_usecase] = param;
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_clock.start();
-    }
-
-    bool renumber{true};
-    auto [graph, d_renumber_map_labels] =
-      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
-        *handle_, input_usecase, randomwalks_usecase.test_weighted, renumber);
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      double elapsed_time{0.0};
-      hr_clock.stop(&elapsed_time);
-      std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n";
-    }
-
-    auto graph_view = graph.view();
-
-    edge_t num_paths = 10;
-    rmm::device_uvector<vertex_t> d_start(num_paths, handle_->get_stream());
-
-    thrust::tabulate(handle_->get_thrust_policy(),
-                     d_start.begin(),
-                     d_start.end(),
-                     [num_vertices = graph_view.number_of_vertices()] __device__(auto idx) {
-                       return (idx % num_vertices);
-                     });
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_clock.start();
-    }
-
-#if 0
-    auto [vertices, weights] = randomwalks_usecase(
-      *handle_, graph_view, raft::device_span<vertex_t const>{d_start.data(), d_start.size()}, size_t{10});
-#else
-    EXPECT_THROW(
-      randomwalks_usecase(*handle_,
-                          graph_view,
-                          raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
-                          size_t{10}),
-      std::exception);
-#endif
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      double elapsed_time{0.0};
-      hr_clock.stop(&elapsed_time);
-      std::cout << "PageRank took " << elapsed_time * 1e-6 << " s.\n";
-    }
-
-    if (randomwalks_usecase.check_correctness) {
-#if 0
-      // FIXME: Need an MG test
-#endif
-    }
-  }
-
- private:
-  static std::unique_ptr<raft::handle_t> handle_;
-};
-
-template <typename tuple_t>
-std::unique_ptr<raft::handle_t> Tests_MGRandomWalks<tuple_t>::handle_ = nullptr;
-
-using Tests_UniformRandomWalks_File =
-  Tests_MGRandomWalks<std::tuple<UniformRandomWalks_Usecase, cugraph::test::File_Usecase>>;
-using Tests_UniformRandomWalks_Rmat =
-  Tests_MGRandomWalks<std::tuple<UniformRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
-using Tests_BiasedRandomWalks_File =
-  Tests_MGRandomWalks<std::tuple<BiasedRandomWalks_Usecase, cugraph::test::File_Usecase>>;
-using Tests_BiasedRandomWalks_Rmat =
-  Tests_MGRandomWalks<std::tuple<BiasedRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
-using Tests_Node2VecRandomWalks_File =
-  Tests_MGRandomWalks<std::tuple<Node2VecRandomWalks_Usecase, cugraph::test::File_Usecase>>;
-using Tests_Node2VecRandomWalks_Rmat =
-  Tests_MGRandomWalks<std::tuple<Node2VecRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
-
-TEST_P(Tests_UniformRandomWalks_File, Initialize_i32_i32_f)
-{
-  run_current_test<int32_t, int32_t, float>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-  simple_test,
-  Tests_UniformRandomWalks_File,
-  ::testing::Combine(
-    ::testing::Values(UniformRandomWalks_Usecase{}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
-
-INSTANTIATE_TEST_SUITE_P(
-  simple_test,
-  Tests_BiasedRandomWalks_File,
-  ::testing::Combine(
-    ::testing::Values(BiasedRandomWalks_Usecase{}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
-
-INSTANTIATE_TEST_SUITE_P(
-  simple_test,
-  Tests_Node2VecRandomWalks_File,
-  ::testing::Combine(
-    ::testing::Values(Node2VecRandomWalks_Usecase{4, 8}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
-                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
-
-CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/random_walks_check.cuh b/cpp/tests/sampling/random_walks_check.cuh
new file mode 100644
index 00000000000..075d2f183bb
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_check.cuh
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sampling/random_walks_check.hpp>
+
+#include <utilities/device_comm_wrapper.hpp>
+
+#include <thrust/sort.h>
+
+#include <gtest/gtest.h>
+
+namespace cugraph {
+namespace test {
+
+template <typename graph_view_type>
+void random_walks_validate(
+  raft::handle_t const& handle,
+  graph_view_type const& graph_view,
+  rmm::device_uvector<typename graph_view_type::vertex_type>&& d_start,
+  rmm::device_uvector<typename graph_view_type::vertex_type>&& d_vertices,
+  std::optional<rmm::device_uvector<typename graph_view_type::weight_type>>&& d_weights,
+  size_t max_length)
+{
+  // FIXME: The sampling functions should use the standard version, not number_of_vertices
+  auto invalid_vertex_id = graph_view.number_of_vertices();
+
+  auto [d_src, d_dst, d_wgt] = graph_view.decompress_to_edgelist(handle, std::nullopt);
+
+  if constexpr (graph_view_type::is_multi_gpu) {
+    using vertex_t = typename graph_view_type::vertex_type;
+    using weight_t = typename graph_view_type::weight_type;
+
+    d_src = cugraph::test::device_gatherv(
+      handle, raft::device_span<vertex_t const>(d_src.data(), d_src.size()));
+    d_dst = cugraph::test::device_gatherv(
+      handle, raft::device_span<vertex_t const>(d_dst.data(), d_dst.size()));
+    if (d_wgt)
+      *d_wgt = cugraph::test::device_gatherv(
+        handle, raft::device_span<weight_t const>(d_wgt->data(), d_wgt->size()));
+
+    d_vertices = cugraph::test::device_gatherv(
+      handle, raft::device_span<vertex_t const>(d_vertices.data(), d_vertices.size()));
+    d_start = cugraph::test::device_gatherv(
+      handle, raft::device_span<vertex_t const>(d_start.data(), d_start.size()));
+
+    if (d_weights)
+      *d_weights = cugraph::test::device_gatherv(
+        handle, raft::device_span<weight_t const>(d_weights->data(), d_weights->size()));
+  }
+
+  if (d_start.size() > 0) {
+    rmm::device_uvector<int> failures(d_start.size() * max_length, handle.get_stream());
+
+    if (d_wgt) {
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(d_src.begin(), d_dst.begin(), d_wgt->begin()),
+                   thrust::make_zip_iterator(d_src.end(), d_dst.end(), d_wgt->end()));
+
+      thrust::transform(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(d_start.size() * max_length),
+        failures.begin(),
+        [src       = d_src.data(),
+         dst       = d_dst.data(),
+         wgt       = d_wgt->data(),
+         vertices  = d_vertices.data(),
+         weights   = d_weights->data(),
+         num_edges = d_src.size(),
+         invalid_vertex_id,
+         max_length] __device__(auto i) {
+          auto const s = vertices[(i / max_length) * (max_length + 1) + (i % max_length)];
+          auto const d = vertices[(i / max_length) * (max_length + 1) + (i % max_length) + 1];
+          auto const w = weights[i];
+
+          // FIXME: if src != invalid_vertex_id and dst == invalid_vertex_id
+          //    should add a check to verify that degree(src) == 0
+          if (d != invalid_vertex_id) {
+            auto iter = thrust::make_zip_iterator(src, dst);
+            auto pos  = thrust::find(thrust::seq, iter, iter + num_edges, thrust::make_tuple(s, d));
+
+            if (pos != (iter + num_edges)) {
+              auto index = thrust::distance(iter, pos);
+
+              for (; (index < num_edges) && (s == src[index]) && (d == dst[index]); ++index) {
+                if (w == wgt[index]) return 0;
+              }
+              printf("edge (%d,%d) found, got weight %g, did not match expected\n",
+                     (int)s,
+                     (int)d,
+                     (float)w);
+            } else {
+              printf("edge (%d,%d) NOT FOUND\n", (int)s, (int)d);
+            }
+
+            return 1;
+          }
+
+          return 0;
+        });
+    } else {
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(d_src.begin(), d_dst.begin()),
+                   thrust::make_zip_iterator(d_src.end(), d_dst.end()));
+
+      thrust::transform(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(d_start.size() * max_length),
+        failures.begin(),
+        [src       = d_src.data(),
+         dst       = d_dst.data(),
+         vertices  = d_vertices.data(),
+         num_edges = d_src.size(),
+         invalid_vertex_id,
+         max_length] __device__(auto i) {
+          auto const s = vertices[(i / max_length) * (max_length + 1) + (i % max_length)];
+          auto const d = vertices[(i / max_length) * (max_length + 1) + (i % max_length) + 1];
+
+          // FIXME: if src != invalid_vertex_id and dst == invalid_vertex_id
+          //    should add a check to verify that degree(src) == 0
+          if (d != invalid_vertex_id) {
+            auto iter = thrust::make_zip_iterator(src, dst);
+            auto pos  = thrust::find(thrust::seq, iter, iter + num_edges, thrust::make_tuple(s, d));
+
+            if (pos == (iter + num_edges)) printf("edge (%d,%d) NOT FOUND\n", (int)s, (int)d);
+
+            return (pos == (iter + num_edges)) ? 1 : 0;
+          }
+
+          return 0;
+        });
+    }
+
+    EXPECT_EQ(0, thrust::reduce(handle.get_thrust_policy(), failures.begin(), failures.end()));
+  }
+}
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/sampling/random_walks_check.hpp b/cpp/tests/sampling/random_walks_check.hpp
new file mode 100644
index 00000000000..c1f5dac61c2
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_check.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/graph_view.hpp>
+
+namespace cugraph {
+namespace test {
+
+template <typename graph_view_type>
+void random_walks_validate(
+  raft::handle_t const& handle,
+  graph_view_type const& graph_view,
+  rmm::device_uvector<typename graph_view_type::vertex_type>&& d_start,
+  rmm::device_uvector<typename graph_view_type::vertex_type>&& d_vertices,
+  std::optional<rmm::device_uvector<typename graph_view_type::weight_type>>&& d_weights,
+  size_t max_length);
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/sampling/random_walks_check_mg.cu b/cpp/tests/sampling/random_walks_check_mg.cu
new file mode 100644
index 00000000000..4536b3cc59c
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_check_mg.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sampling/random_walks_check.cuh>
+
+namespace cugraph {
+namespace test {
+
+template void random_walks_validate(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
+  rmm::device_uvector<int32_t>&& d_start,
+  rmm::device_uvector<int32_t>&& d_vertices,
+  std::optional<rmm::device_uvector<float>>&& d_weights,
+  size_t max_length);
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/sampling/random_walks_check_sg.cu b/cpp/tests/sampling/random_walks_check_sg.cu
new file mode 100644
index 00000000000..54c900a7933
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_check_sg.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sampling/random_walks_check.cuh>
+
+namespace cugraph {
+namespace test {
+
+template void random_walks_validate(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, float, false, false> const& graph_view,
+  rmm::device_uvector<int32_t>&& d_start,
+  rmm::device_uvector<int32_t>&& d_vertices,
+  std::optional<rmm::device_uvector<float>>&& d_weights,
+  size_t max_length);
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/sampling/sg_random_walks_test.cu b/cpp/tests/sampling/sg_random_walks_test.cpp
similarity index 52%
rename from cpp/tests/sampling/sg_random_walks_test.cu
rename to cpp/tests/sampling/sg_random_walks_test.cpp
index 51ced4ee700..a46aaae7fb8 100644
--- a/cpp/tests/sampling/sg_random_walks_test.cu
+++ b/cpp/tests/sampling/sg_random_walks_test.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <sampling/random_walks_check.hpp>
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/high_res_clock.h>
 #include <utilities/test_graphs.hpp>
@@ -27,28 +29,6 @@
 
 #include <gtest/gtest.h>
 
-#if 0
-#include "cuda_profiler_api.h"
-
-#include <rmm/exec_policy.hpp>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/random.h>
-#include <thrust/transform.h>
-
-#include <sampling/random_walks.cuh>
-
-#include <raft/handle.hpp>
-
-#include "random_walks_utils.cuh"
-
-#include <algorithm>
-#include <iterator>
-#include <limits>
-#include <numeric>
-#include <tuple>
-#include <vector>
-#endif
-
 struct UniformRandomWalks_Usecase {
   bool test_weighted{false};
   uint64_t seed{0};
@@ -59,10 +39,12 @@ struct UniformRandomWalks_Usecase {
   operator()(raft::handle_t const& handle,
              cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
              raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
+             size_t num_paths)
   {
-    return cugraph::uniform_random_walks(handle, graph_view, start_vertices, max_depth, seed);
+    return cugraph::uniform_random_walks(handle, graph_view, start_vertices, num_paths, seed);
   }
+
+  bool expect_throw() { return false; }
 };
 
 struct BiasedRandomWalks_Usecase {
@@ -75,10 +57,13 @@ struct BiasedRandomWalks_Usecase {
   operator()(raft::handle_t const& handle,
              cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
              raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
+             size_t num_paths)
   {
-    return cugraph::biased_random_walks(handle, graph_view, start_vertices, max_depth, seed);
+    return cugraph::biased_random_walks(handle, graph_view, start_vertices, num_paths, seed);
   }
+
+  // FIXME: Not currently implemented
+  bool expect_throw() { return true; }
 };
 
 struct Node2VecRandomWalks_Usecase {
@@ -93,11 +78,19 @@ struct Node2VecRandomWalks_Usecase {
   operator()(raft::handle_t const& handle,
              cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const& graph_view,
              raft::device_span<vertex_t const> start_vertices,
-             size_t max_depth)
+             size_t num_paths)
   {
-    return cugraph::node2vec_random_walks(
-      handle, graph_view, start_vertices, max_depth, p, q, seed);
+    return cugraph::node2vec_random_walks(handle,
+                                          graph_view,
+                                          start_vertices,
+                                          num_paths,
+                                          static_cast<weight_t>(p),
+                                          static_cast<weight_t>(q),
+                                          seed);
   }
+
+  // FIXME: Not currently implemented
+  bool expect_throw() { return true; }
 };
 
 template <typename tuple_t>
@@ -123,7 +116,6 @@ class Tests_RandomWalks : public ::testing::TestWithParam<tuple_t> {
       hr_clock.start();
     }
 
-    // TODO: Do I need to turn renumber off?  It's off in the old test
     bool renumber{true};
     auto [graph, d_renumber_map_labels] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
@@ -136,51 +128,48 @@ class Tests_RandomWalks : public ::testing::TestWithParam<tuple_t> {
       std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n";
     }
 
-    auto graph_view = graph.view();
-
-    edge_t num_paths = 10;
+    auto graph_view   = graph.view();
+    edge_t num_paths  = std::min(edge_t{10}, graph_view.number_of_vertices());
+    edge_t max_length = 10;
     rmm::device_uvector<vertex_t> d_start(num_paths, handle.get_stream());
 
-    thrust::tabulate(handle.get_thrust_policy(),
-                     d_start.begin(),
-                     d_start.end(),
-                     [num_vertices = graph_view.number_of_vertices()] __device__(auto idx) {
-                       return (idx % num_vertices);
-                     });
-
-    edge_t max_depth{10};
+    cugraph::detail::sequence_fill(handle.get_stream(), d_start.begin(), d_start.size(), 0);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
       hr_clock.start();
     }
 
-#if 0
-    auto [vertices, weights] = randomwalks_usecase(
-      handle, graph_view, raft::device_span<vertex_t const>{d_start.data(), d_start.size()}, size_t{10});
-#else
-    EXPECT_THROW(
-      randomwalks_usecase(handle,
-                          graph_view,
-                          raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
-                          size_t{10}),
-      std::exception);
-#endif
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      double elapsed_time{0.0};
-      hr_clock.stop(&elapsed_time);
-      std::cout << "PageRank took " << elapsed_time * 1e-6 << " s.\n";
-    }
-
-    if (randomwalks_usecase.check_correctness) {
-#if 0
-      bool test_all_paths =
-        cugraph::test::host_check_rw_paths(handle, graph_view, vertices, weights);
-
-      ASSERT_TRUE(test_all_paths);
-#endif
+    if (randomwalks_usecase.expect_throw()) {
+      // biased and node2vec currently throw since they are not implemented
+      EXPECT_THROW(
+        randomwalks_usecase(handle,
+                            graph_view,
+                            raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
+                            max_length),
+        std::exception);
+    } else {
+      auto [d_vertices, d_weights] =
+        randomwalks_usecase(handle,
+                            graph_view,
+                            raft::device_span<vertex_t const>{d_start.data(), d_start.size()},
+                            max_length);
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        double elapsed_time{0.0};
+        hr_clock.stop(&elapsed_time);
+        std::cout << "RandomWalks took " << elapsed_time * 1e-6 << " s.\n";
+      }
+
+      if (randomwalks_usecase.check_correctness) {
+        cugraph::test::random_walks_validate(handle,
+                                             graph_view,
+                                             std::move(d_start),
+                                             std::move(d_vertices),
+                                             std::move(d_weights),
+                                             max_length);
+      }
     }
   }
 };
@@ -204,11 +193,42 @@ TEST_P(Tests_UniformRandomWalks_File, Initialize_i32_i32_f)
     override_File_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+TEST_P(Tests_UniformRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_BiasedRandomWalks_File, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_BiasedRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Node2VecRandomWalks_File, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Node2VecRandomWalks_Rmat, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
 INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_UniformRandomWalks_File,
   ::testing::Combine(
-    ::testing::Values(UniformRandomWalks_Usecase{}),
+    ::testing::Values(UniformRandomWalks_Usecase{false, 0, true},
+                      UniformRandomWalks_Usecase{true, 0, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -218,7 +238,8 @@ INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_BiasedRandomWalks_File,
   ::testing::Combine(
-    ::testing::Values(BiasedRandomWalks_Usecase{}),
+    ::testing::Values(BiasedRandomWalks_Usecase{false, 0, true},
+                      BiasedRandomWalks_Usecase{true, 0, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -228,10 +249,56 @@ INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_Node2VecRandomWalks_File,
   ::testing::Combine(
-    ::testing::Values(Node2VecRandomWalks_Usecase{4, 8}),
+    ::testing::Values(Node2VecRandomWalks_Usecase{4, 8, false, 0, true},
+                      Node2VecRandomWalks_Usecase{4, 8, true, 0, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
                       cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
 
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_UniformRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{false, 0, true},
+                                       UniformRandomWalks_Usecase{true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_UniformRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_BiasedRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(BiasedRandomWalks_Usecase{false, 0, true},
+                                       BiasedRandomWalks_Usecase{true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_BiasedRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(BiasedRandomWalks_Usecase{true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Node2VecRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(Node2VecRandomWalks_Usecase{8, 4, false, 0, true},
+                                       Node2VecRandomWalks_Usecase{8, 4, true, 0, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test,
+  Tests_Node2VecRandomWalks_Rmat,
+  ::testing::Combine(::testing::Values(Node2VecRandomWalks_Usecase{8, 4, true, 0, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, false))));
+
 CUGRAPH_TEST_PROGRAM_MAIN()