diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d9bea1864..78c67d9c8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -290,9 +290,14 @@ target_compile_options(
 add_library(
   cuvs SHARED
   src/cluster/kmeans_balanced_fit_float.cu
+  src/cluster/kmeans_fit_mg_float.cu
+  src/cluster/kmeans_fit_mg_double.cu
+  src/cluster/kmeans_fit_double.cu
   src/cluster/kmeans_fit_float.cu
   src/cluster/kmeans_auto_find_k_float.cu
+  src/cluster/kmeans_fit_predict_double.cu
   src/cluster/kmeans_fit_predict_float.cu
+  src/cluster/kmeans_predict_double.cu
   src/cluster/kmeans_predict_float.cu
   src/cluster/kmeans_balanced_fit_float.cu
   src/cluster/kmeans_balanced_fit_predict_float.cu
@@ -300,6 +305,7 @@ add_library(
   src/cluster/kmeans_balanced_fit_int8.cu
   src/cluster/kmeans_balanced_fit_predict_int8.cu
   src/cluster/kmeans_balanced_predict_int8.cu
+  src/cluster/kmeans_transform_double.cu
   src/cluster/kmeans_transform_float.cu
   src/cluster/single_linkage_float.cu
   src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -342,6 +348,8 @@ add_library(
   src/distance/detail/pairwise_matrix/dispatch_russel_rao_half_float_float_int.cu
   src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
   src/distance/detail/pairwise_matrix/dispatch_rbf.cu
+  src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu
+  src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu
   src/distance/detail/fused_distance_nn.cu
   src/distance/distance.cu
   src/distance/pairwise_distance.cu
diff --git a/cpp/include/cuvs/cluster/kmeans.hpp b/cpp/include/cuvs/cluster/kmeans.hpp
index 75205fa4f..89b3acc24 100644
--- a/cpp/include/cuvs/cluster/kmeans.hpp
+++ b/cpp/include/cuvs/cluster/kmeans.hpp
@@ -153,7 +153,7 @@ struct balanced_params : base_params {
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::params params;
+ *   cuvs::cluster::kmeans::params params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -203,7 +203,159 @@ void fit(raft::resources const& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::params params;
+ *   cuvs::cluster::kmeans::params params;
+ *   int64_t n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<float, int64_t>(handle, params.n_clusters,
+ * n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids,
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int64_t> X,
+         std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids,
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int64_t n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int64_t>(handle, params.n_clusters,
+ * n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids,
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int64_t> X,
+         std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -250,7 +402,7 @@ void fit(raft::resources const& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::balanced_params params;
+ *   cuvs::cluster::kmeans::balanced_params params;
  *   int n_features = 15;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -284,7 +436,7 @@ void fit(const raft::resources& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::balanced_params params;
+ *   cuvs::cluster::kmeans::balanced_params params;
  *   int n_features = 15;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -308,7 +460,6 @@ void fit(const raft::resources& handle,
          cuvs::cluster::kmeans::balanced_params const& params,
          raft::device_matrix_view<const int8_t, int> X,
          raft::device_matrix_view<int8_t, int> centroids);
-
 /**
  * @brief Predict the closest cluster each sample in X belongs to.
  *
@@ -318,7 +469,7 @@ void fit(const raft::resources& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::params params;
+ *   cuvs::cluster::kmeans::params params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -363,7 +514,7 @@ void predict(raft::resources const& handle,
              raft::device_matrix_view<const float, int> X,
              std::optional<raft::device_vector_view<const float, int>> sample_weight,
              raft::device_matrix_view<const float, int> centroids,
-             raft::device_vector_view<uint32_t, int> labels,
+             raft::device_vector_view<int, int> labels,
              bool normalize_weight,
              raft::host_scalar_view<float> inertia);
 
@@ -376,7 +527,7 @@ void predict(raft::resources const& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::params params;
+ *   cuvs::cluster::kmeans::params params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
@@ -388,7 +539,7 @@ void predict(raft::resources const& handle,
  *               raft::make_scalar_view(&inertia),
  *               raft::make_scalar_view(&n_iter));
  *   ...
- *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
+ *   auto labels = raft::make_device_vector<int64_t, int>(handle, X.extent(0));
  *
  *   kmeans::predict(handle,
  *                   params,
@@ -404,18 +555,26 @@ void predict(raft::resources const& handle,
  * @param[in]     params           Parameters for KMeans model.
  * @param[in]     X                New data to predict.
  *                                 [dim = n_samples x n_features]
+ * @param[in]     sample_weight    Optional weights for each observation in X.
+ *                                 [len = n_samples]
  * @param[in]     centroids        Cluster centroids. The data must be in
  *                                 row-major format.
  *                                 [dim = n_clusters x n_features]
+ * @param[in]     normalize_weight True if the weights should be normalized
  * @param[out]    labels           Index of the cluster each sample in X
  *                                 belongs to.
  *                                 [len = n_samples]
+ * @param[out]    inertia          Sum of squared distances of samples to
+ *                                 their closest cluster center.
  */
-void predict(const raft::resources& handle,
-             cuvs::cluster::kmeans::balanced_params const& params,
-             raft::device_matrix_view<const int8_t, int> X,
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const float, int> X,
+             std::optional<raft::device_vector_view<const float, int>> sample_weight,
              raft::device_matrix_view<const float, int> centroids,
-             raft::device_vector_view<uint32_t, int> labels);
+             raft::device_vector_view<int64_t, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<float> inertia);
 
 /**
  * @brief Predict the closest cluster each sample in X belongs to.
@@ -426,22 +585,144 @@ void predict(const raft::resources& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::balanced_params params;
- *   int n_features = 15;
+ *   cuvs::cluster::kmeans::params params;
+ *   int n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ *   ...
+ *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
+ *
+ *   kmeans::predict(handle,
+ *                   params,
+ *                   X,
+ *                   std::nullopt,
+ *                   centroids.view(),
+ *                   false,
+ *                   labels.view(),
+ *                   raft::make_scalar_view(&ineratia));
+ * @endcode
+ *
+ * @param[in]     handle           The raft handle.
+ * @param[in]     params           Parameters for KMeans model.
+ * @param[in]     X                New data to predict.
+ *                                 [dim = n_samples x n_features]
+ * @param[in]     sample_weight    Optional weights for each observation in X.
+ *                                 [len = n_samples]
+ * @param[in]     centroids        Cluster centroids. The data must be in
+ *                                 row-major format.
+ *                                 [dim = n_clusters x n_features]
+ * @param[in]     normalize_weight True if the weights should be normalized
+ * @param[out]    labels           Index of the cluster each sample in X
+ *                                 belongs to.
+ *                                 [len = n_samples]
+ * @param[out]    inertia          Sum of squared distances of samples to
+ *                                 their closest cluster center.
+ */
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const double, int> X,
+             std::optional<raft::device_vector_view<const double, int>> sample_weight,
+             raft::device_matrix_view<const double, int> centroids,
+             raft::device_vector_view<int, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<double> inertia);
+
+/**
+ * @brief Predict the closest cluster each sample in X belongs to.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ *   ...
+ *   auto labels = raft::make_device_vector<int64_t, int>(handle, X.extent(0));
+ *
+ *   kmeans::predict(handle,
+ *                   params,
+ *                   X,
+ *                   std::nullopt,
+ *                   centroids.view(),
+ *                   false,
+ *                   labels.view(),
+ *                   raft::make_scalar_view(&ineratia));
+ * @endcode
+ *
+ * @param[in]     handle           The raft handle.
+ * @param[in]     params           Parameters for KMeans model.
+ * @param[in]     X                New data to predict.
+ *                                 [dim = n_samples x n_features]
+ * @param[in]     sample_weight    Optional weights for each observation in X.
+ *                                 [len = n_samples]
+ * @param[in]     centroids        Cluster centroids. The data must be in
+ *                                 row-major format.
+ *                                 [dim = n_clusters x n_features]
+ * @param[in]     normalize_weight True if the weights should be normalized
+ * @param[out]    labels           Index of the cluster each sample in X
+ *                                 belongs to.
+ *                                 [len = n_samples]
+ * @param[out]    inertia          Sum of squared distances of samples to
+ *                                 their closest cluster center.
+ */
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const double, int> X,
+             std::optional<raft::device_vector_view<const double, int>> sample_weight,
+             raft::device_matrix_view<const double, int> centroids,
+             raft::device_vector_view<int64_t, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<double> inertia);
+
+/**
+ * @brief Predict the closest cluster each sample in X belongs to.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *
  *   kmeans::fit(handle,
  *               params,
  *               X,
- *               centroids.view());
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
  *   ...
  *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
  *
  *   kmeans::predict(handle,
  *                   params,
  *                   X,
+ *                   std::nullopt,
  *                   centroids.view(),
- *                   labels.view());
+ *                   false,
+ *                   labels.view(),
+ *                   raft::make_scalar_view(&ineratia));
  * @endcode
  *
  * @param[in]     handle           The raft handle.
@@ -457,7 +738,7 @@ void predict(const raft::resources& handle,
  */
 void predict(const raft::resources& handle,
              cuvs::cluster::kmeans::balanced_params const& params,
-             raft::device_matrix_view<const float, int> X,
+             raft::device_matrix_view<const int8_t, int> X,
              raft::device_matrix_view<const float, int> centroids,
              raft::device_vector_view<uint32_t, int> labels);
 
@@ -471,7 +752,7 @@ void predict(const raft::resources& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::params params;
+ *   cuvs::cluster::kmeans::params params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
@@ -516,6 +797,171 @@ void fit_predict(raft::resources const& handle,
                  raft::host_scalar_view<float> inertia,
                  raft::host_scalar_view<int> n_iter);
 
+/**
+ * @brief Compute k-means clustering and predicts cluster index for each sample
+ * in the input.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int64_t n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<float, int64_t>(handle, params.n_clusters,
+ * n_features); auto labels = raft::make_device_vector<int64_t, int64_t>(handle, X.extent(0));
+ *
+ *   kmeans::fit_predict(handle,
+ *                       params,
+ *                       X,
+ *                       std::nullopt,
+ *                       centroids.view(),
+ *                       labels.view(),
+ *                       raft::make_scalar_view(&inertia),
+ *                       raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must be
+ *                              in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Optional
+ *                              [in] When init is InitMethod::Array, use
+ *                              centroids  as the initial cluster centers
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    labels        Index of the cluster each sample in X belongs
+ *                              to.
+ *                              [len = n_samples]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const float, int64_t> X,
+                 std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+                 std::optional<raft::device_matrix_view<float, int64_t>> centroids,
+                 raft::device_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<float> inertia,
+                 raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @brief Compute k-means clustering and predicts cluster index for each sample
+ * in the input.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int>(handle, params.n_clusters, n_features);
+ *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
+ *
+ *   kmeans::fit_predict(handle,
+ *                       params,
+ *                       X,
+ *                       std::nullopt,
+ *                       centroids.view(),
+ *                       labels.view(),
+ *                       raft::make_scalar_view(&inertia),
+ *                       raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must be
+ *                              in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Optional
+ *                              [in] When init is InitMethod::Array, use
+ *                              centroids  as the initial cluster centers
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    labels        Index of the cluster each sample in X belongs
+ *                              to.
+ *                              [len = n_samples]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const double, int> X,
+                 std::optional<raft::device_vector_view<const double, int>> sample_weight,
+                 std::optional<raft::device_matrix_view<double, int>> centroids,
+                 raft::device_vector_view<int, int> labels,
+                 raft::host_scalar_view<double> inertia,
+                 raft::host_scalar_view<int> n_iter);
+
+/**
+ * @brief Compute k-means clustering and predicts cluster index for each sample
+ * in the input.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace  cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   int64_t n_features = 15, inertia, n_iter;
+ *   auto centroids = raft::make_device_matrix<double, int64_t>(handle, params.n_clusters,
+ * n_features); auto labels = raft::make_device_vector<int64_t, int64_t>(handle, X.extent(0));
+ *
+ *   kmeans::fit_predict(handle,
+ *                       params,
+ *                       X,
+ *                       std::nullopt,
+ *                       centroids.view(),
+ *                       labels.view(),
+ *                       raft::make_scalar_view(&inertia),
+ *                       raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must be
+ *                              in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Optional
+ *                              [in] When init is InitMethod::Array, use
+ *                              centroids  as the initial cluster centers
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    labels        Index of the cluster each sample in X belongs
+ *                              to.
+ *                              [len = n_samples]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const double, int64_t> X,
+                 std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+                 std::optional<raft::device_matrix_view<double, int64_t>> centroids,
+                 raft::device_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<double> inertia,
+                 raft::host_scalar_view<int64_t> n_iter);
+
 /**
  * @brief Compute balanced k-means clustering and predicts cluster index for each sample
  * in the input.
@@ -526,7 +972,7 @@ void fit_predict(raft::resources const& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::balanced_params params;
+ *   cuvs::cluster::kmeans::balanced_params params;
  *   int n_features = 15;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
@@ -570,7 +1016,7 @@ void fit_predict(const raft::resources& handle,
  *   using namespace  cuvs::cluster;
  *   ...
  *   raft::resources handle;
- *    cuvs::cluster::kmeans::balanced_params params;
+ *   cuvs::cluster::kmeans::balanced_params params;
  *   int n_features = 15;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
  *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
@@ -623,6 +1069,24 @@ void transform(raft::resources const& handle,
                raft::device_matrix_view<const float, int> centroids,
                raft::device_matrix_view<float, int> X_new);
 
+/**
+ * @brief Transform X to a cluster-distance space.
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format
+ *                              [dim = n_samples x n_features]
+ * @param[in]     centroids     Cluster centroids. The data must be in row-major format.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    X_new         X transformed in the new space.
+ *                              [dim = n_samples x n_features]
+ */
+void transform(raft::resources const& handle,
+               const kmeans::params& params,
+               raft::device_matrix_view<const double, int> X,
+               raft::device_matrix_view<const double, int> centroids,
+               raft::device_matrix_view<double, int> X_new);
 /**
  * @}
  */
diff --git a/cpp/src/cluster/detail/connectivities.cuh b/cpp/src/cluster/detail/connectivities.cuh
index ada424192..e61c9166f 100644
--- a/cpp/src/cluster/detail/connectivities.cuh
+++ b/cpp/src/cluster/detail/connectivities.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../../distance/distance.cuh"
+#include "./kmeans_common.cuh"
 #include <cuvs/cluster/agglomerative.hpp>
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -153,7 +153,11 @@ void pairwise_distances(const raft::resources& handle,
   // TODO: It would ultimately be nice if the MST could accept
   // dense inputs directly so we don't need to double the memory
   // usage to hand it a sparse array here.
-  distance::pairwise_distance<value_t, value_idx>(handle, X, X, data, m, m, n, metric);
+  auto X_view = raft::make_device_matrix_view<const value_t, value_idx>(X, m, n);
+
+  cuvs::cluster::kmeans::detail::pairwise_distance_kmeans<value_t, value_idx>(
+    handle, X_view, X_view, raft::make_device_matrix_view<value_t, value_idx>(data, m, m), metric);
+
   // self-loops get max distance
   auto transform_in =
     thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator(0), data));
diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh
index e7d4bdf76..9b673bca3 100644
--- a/cpp/src/cluster/detail/kmeans.cuh
+++ b/cpp/src/cluster/detail/kmeans.cuh
@@ -198,7 +198,7 @@ void kmeansPlusPlus(raft::resources const& handle,
     // Output - pwd [n_trials x n_samples]
     auto pwd = distBuffer.view();
     cuvs::cluster::kmeans::detail::pairwise_distance_kmeans<DataT, IndexT>(
-      handle, centroidCandidates.view(), X, pwd, workspace, metric);
+      handle, centroidCandidates.view(), X, pwd, metric);
 
     // Update nearest cluster distance for each centroid candidate
     // Note pwd and minDistBuf points to same buffer which currently holds pairwise distance values.
@@ -1247,7 +1247,7 @@ void kmeans_transform(raft::resources const& handle,
     // calculate pairwise distance between cluster centroids and current batch
     // of input dataset
     pairwise_distance_kmeans<DataT, IndexT>(
-      handle, datasetView, centroids, pairwiseDistanceView, workspace, metric);
+      handle, datasetView, centroids, pairwiseDistanceView, metric);
   }
 }
 
diff --git a/cpp/src/cluster/detail/kmeans_common.cuh b/cpp/src/cluster/detail/kmeans_common.cuh
index 04c1a6802..eec71b5d2 100644
--- a/cpp/src/cluster/detail/kmeans_common.cuh
+++ b/cpp/src/cluster/detail/kmeans_common.cuh
@@ -293,7 +293,6 @@ void pairwise_distance_kmeans(raft::resources const& handle,
                               raft::device_matrix_view<const DataT, IndexT> X,
                               raft::device_matrix_view<const DataT, IndexT> centroids,
                               raft::device_matrix_view<DataT, IndexT> pairwiseDistance,
-                              rmm::device_uvector<char>& workspace,
                               cuvs::distance::DistanceType metric)
 {
   auto n_samples  = X.extent(0);
@@ -303,15 +302,23 @@ void pairwise_distance_kmeans(raft::resources const& handle,
   ASSERT(X.extent(1) == centroids.extent(1),
          "# features in dataset and centroids are different (must be same)");
 
-  cuvs::distance::pairwise_distance(handle,
-                                    X.data_handle(),
-                                    centroids.data_handle(),
-                                    pairwiseDistance.data_handle(),
-                                    n_samples,
-                                    n_clusters,
-                                    n_features,
-                                    workspace,
-                                    metric);
+  if (metric == cuvs::distance::DistanceType::L2Expanded) {
+    cuvs::distance::distance<cuvs::distance::DistanceType::L2Expanded,
+                             DataT,
+                             DataT,
+                             DataT,
+                             raft::layout_c_contiguous,
+                             IndexT>(handle, X, centroids, pairwiseDistance);
+  } else if (metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
+    cuvs::distance::distance<cuvs::distance::DistanceType::L2SqrtExpanded,
+                             DataT,
+                             DataT,
+                             DataT,
+                             raft::layout_c_contiguous,
+                             IndexT>(handle, X, centroids, pairwiseDistance);
+  } else {
+    RAFT_FAIL("kmeans requires L2Expanded or L2SqrtExpanded distance, have %i", metric);
+  }
 }
 
 // shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores
@@ -461,7 +468,7 @@ void minClusterAndDistanceCompute(
         // calculate pairwise distance between current tile of cluster centroids
         // and input dataset
         pairwise_distance_kmeans<DataT, IndexT>(
-          handle, datasetView, centroidsView, pairwiseDistanceView, workspace, metric);
+          handle, datasetView, centroidsView, pairwiseDistanceView, metric);
 
         // argmin reduction returning <index, value> pair
         // calculates the closest centroid and the distance to the closest
@@ -591,7 +598,7 @@ void minClusterDistanceCompute(raft::resources const& handle,
         // calculate pairwise distance between current tile of cluster centroids
         // and input dataset
         pairwise_distance_kmeans<DataT, IndexT>(
-          handle, datasetView, centroidsView, pairwiseDistanceView, workspace, metric);
+          handle, datasetView, centroidsView, pairwiseDistanceView, metric);
 
         raft::linalg::coalescedReduction(minClusterDistanceView.data_handle(),
                                          pairwiseDistanceView.data_handle(),
diff --git a/cpp/src/cluster/detail/kmeans_mg.cuh b/cpp/src/cluster/detail/kmeans_mg.cuh
new file mode 100644
index 000000000..b0f435502
--- /dev/null
+++ b/cpp/src/cluster/detail/kmeans_mg.cuh
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../kmeans.cuh"
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/kvp.hpp>
+#include <raft/core/resource/comms.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/linalg/map_then_reduce.cuh>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <cuda/functional>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
+#include <cstdint>
+#include <random>
+
+namespace cuvs::cluster::kmeans::mg::detail {
+
+#define CUVS_LOG_KMEANS(handle, fmt, ...)                    \
+  do {                                                       \
+    bool isRoot = true;                                      \
+    if (raft::resource::comms_initialized(handle)) {         \
+      const auto& comm  = raft::resource::get_comms(handle); \
+      const int my_rank = comm.get_rank();                   \
+      isRoot            = my_rank == 0;                      \
+    }                                                        \
+    if (isRoot) { RAFT_LOG_DEBUG(fmt, ##__VA_ARGS__); }      \
+  } while (0)
+
+template <typename IndexT, typename DataT>
+struct KeyValueIndexOp {
+  __host__ __device__ __forceinline__ IndexT
+  operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
+  {
+    return a.key;
+  }
+};
+
+#define KMEANS_COMM_ROOT 0
+
+static cuvs::cluster::kmeans::params default_params;
+
+// Selects 'n_clusters' samples randomly from X
+template <typename DataT, typename IndexT>
+void initRandom(const raft::resources& handle,
+                const cuvs::cluster::kmeans::params& params,
+                raft::device_matrix_view<const DataT, IndexT> X,
+                raft::device_matrix_view<DataT, IndexT> centroids)
+{
+  const auto& comm     = raft::resource::get_comms(handle);
+  cudaStream_t stream  = raft::resource::get_cuda_stream(handle);
+  auto n_local_samples = X.extent(0);
+  auto n_features      = X.extent(1);
+  auto n_clusters      = params.n_clusters;
+
+  const int my_rank = comm.get_rank();
+  const int n_ranks = comm.get_size();
+
+  std::vector<int> nCentroidsSampledByRank(n_ranks, 0);
+  std::vector<size_t> nCentroidsElementsToReceiveFromRank(n_ranks, 0);
+
+  const int nranks_reqd = std::min(n_ranks, n_clusters);
+  ASSERT(KMEANS_COMM_ROOT < nranks_reqd, "KMEANS_COMM_ROOT must be in [0,  %d)\n", nranks_reqd);
+
+  for (int rank = 0; rank < nranks_reqd; ++rank) {
+    int nCentroidsSampledInRank = n_clusters / nranks_reqd;
+    if (rank == KMEANS_COMM_ROOT) {
+      nCentroidsSampledInRank += n_clusters - nCentroidsSampledInRank * nranks_reqd;
+    }
+    nCentroidsSampledByRank[rank]             = nCentroidsSampledInRank;
+    nCentroidsElementsToReceiveFromRank[rank] = nCentroidsSampledInRank * n_features;
+  }
+
+  auto nCentroidsSampledInRank = nCentroidsSampledByRank[my_rank];
+  ASSERT((IndexT)nCentroidsSampledInRank <= (IndexT)n_local_samples,
+         "# random samples requested from rank-%d is larger than the available "
+         "samples at the rank (requested is %lu, available is %lu)",
+         my_rank,
+         (size_t)nCentroidsSampledInRank,
+         (size_t)n_local_samples);
+
+  auto centroidsSampledInRank =
+    raft::make_device_matrix<DataT, IndexT>(handle, nCentroidsSampledInRank, n_features);
+
+  cuvs::cluster::kmeans::shuffle_and_gather(
+    handle, X, centroidsSampledInRank.view(), nCentroidsSampledInRank, params.rng_state.seed);
+
+  std::vector<size_t> displs(n_ranks);
+  thrust::exclusive_scan(thrust::host,
+                         nCentroidsElementsToReceiveFromRank.begin(),
+                         nCentroidsElementsToReceiveFromRank.end(),
+                         displs.begin());
+
+  // gather centroids from all ranks
+  comm.allgatherv<DataT>(centroidsSampledInRank.data_handle(),        // sendbuff
+                         centroids.data_handle(),                     // recvbuff
+                         nCentroidsElementsToReceiveFromRank.data(),  // recvcount
+                         displs.data(),
+                         stream);
+}
+
+/*
+ * @brief Selects 'n_clusters' samples from X using scalable kmeans++ algorithm
+ * Scalable kmeans++ pseudocode
+ * 1: C = sample a point uniformly at random from X
+ * 2: psi = phi_X (C)
+ * 3: for O( log(psi) ) times do
+ * 4:   C' = sample each point x in X independently with probability
+ *           p_x = l * ( d^2(x, C) / phi_X (C) )
+ * 5:   C = C U C'
+ * 6: end for
+ * 7: For x in C, set w_x to be the number of points in X closer to x than any
+ *    other point in C
+ * 8: Recluster the weighted points in C into k clusters
+ */
+template <typename DataT, typename IndexT>
+void initKMeansPlusPlus(const raft::resources& handle,
+                        const cuvs::cluster::kmeans::params& params,
+                        raft::device_matrix_view<const DataT, IndexT> X,
+                        raft::device_matrix_view<DataT, IndexT> centroidsRawData,
+                        rmm::device_uvector<char>& workspace)
+{
+  const auto& comm    = raft::resource::get_comms(handle);
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  const int my_rank   = comm.get_rank();
+  const int n_rank    = comm.get_size();
+
+  auto n_samples  = X.extent(0);
+  auto n_features = X.extent(1);
+  auto n_clusters = params.n_clusters;
+  auto metric     = params.metric;
+
+  raft::random::RngState rng(params.rng_state.seed, raft::random::GeneratorType::GenPhilox);
+
+  // <<<< Step-1 >>> : C <- sample a point uniformly at random from X
+  //    1.1 - Select a rank r' at random from the available n_rank ranks with a
+  //          probability of 1/n_rank [Note - with same seed all rank selects
+  //          the same r' which avoids a call to comm]
+  //    1.2 - Rank r' samples a point uniformly at random from the local dataset
+  //          X which will be used as the initial centroid for kmeans++
+  //    1.3 - Communicate the initial centroid chosen by rank-r' to all other
+  //          ranks
+  std::mt19937 gen(params.rng_state.seed);
+  std::uniform_int_distribution<> dis(0, n_rank - 1);
+  int rp = dis(gen);
+
+  // buffer to flag the sample that is chosen as initial centroids
+  std::vector<std::uint8_t> h_isSampleCentroid(n_samples);
+  std::fill(h_isSampleCentroid.begin(), h_isSampleCentroid.end(), 0);
+
+  auto initialCentroid = raft::make_device_matrix<DataT, IndexT>(handle, 1, n_features);
+  CUVS_LOG_KMEANS(
+    handle, "@Rank-%d : KMeans|| : initial centroid is sampled at rank-%d\n", my_rank, rp);
+
+  //    1.2 - Rank r' samples a point uniformly at random from the local dataset
+  //          X which will be used as the initial centroid for kmeans++
+  if (my_rank == rp) {
+    std::mt19937 gen(params.rng_state.seed);
+    std::uniform_int_distribution<> dis(0, n_samples - 1);
+
+    int cIdx           = dis(gen);
+    auto centroidsView = raft::make_device_matrix_view<const DataT, IndexT>(
+      X.data_handle() + cIdx * n_features, 1, n_features);
+
+    raft::copy(
+      initialCentroid.data_handle(), centroidsView.data_handle(), centroidsView.size(), stream);
+
+    h_isSampleCentroid[cIdx] = 1;
+  }
+
+  // 1.3 - Communicate the initial centroid chosen by rank-r' to all other ranks
+  comm.bcast<DataT>(initialCentroid.data_handle(), initialCentroid.size(), rp, stream);
+
+  // device buffer to flag the sample that is chosen as initial centroid
+  auto isSampleCentroid = raft::make_device_vector<std::uint8_t, IndexT>(handle, n_samples);
+
+  raft::copy(
+    isSampleCentroid.data_handle(), h_isSampleCentroid.data(), isSampleCentroid.size(), stream);
+
+  rmm::device_uvector<DataT> centroidsBuf(0, stream);
+
+  // reset buffer to store the chosen centroid
+  centroidsBuf.resize(initialCentroid.size(), stream);
+  raft::copy(centroidsBuf.begin(), initialCentroid.data_handle(), initialCentroid.size(), stream);
+
+  auto potentialCentroids = raft::make_device_matrix_view<DataT, IndexT>(
+    centroidsBuf.data(), initialCentroid.extent(0), initialCentroid.extent(1));
+  // <<< End of Step-1 >>>
+
+  rmm::device_uvector<DataT> L2NormBuf_OR_DistBuf(0, stream);
+
+  // L2 norm of X: ||x||^2
+  auto L2NormX = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
+  if (metric == cuvs::distance::DistanceType::L2Expanded ||
+      metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
+    raft::linalg::rowNorm(L2NormX.data_handle(),
+                          X.data_handle(),
+                          X.extent(1),
+                          X.extent(0),
+                          raft::linalg::L2Norm,
+                          true,
+                          stream);
+  }
+
+  auto minClusterDistance = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
+  auto uniformRands       = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
+
+  // <<< Step-2 >>>: psi <- phi_X (C)
+  auto clusterCost = raft::make_device_scalar<DataT>(handle, 0);
+
+  cuvs::cluster::kmeans::min_cluster_distance(handle,
+                                              X,
+                                              potentialCentroids,
+                                              minClusterDistance.view(),
+                                              L2NormX.view(),
+                                              L2NormBuf_OR_DistBuf,
+                                              params.metric,
+                                              params.batch_samples,
+                                              params.batch_centroids,
+                                              workspace);
+
+  // compute partial cluster cost from the samples in rank
+  cuvs::cluster::kmeans::cluster_cost(
+    handle,
+    minClusterDistance.view(),
+    workspace,
+    clusterCost.view(),
+    cuda::proclaim_return_type<DataT>(
+      [] __device__(const DataT& a, const DataT& b) { return a + b; }));
+
+  // compute total cluster cost by accumulating the partial cost from all the
+  // ranks
+  comm.allreduce(
+    clusterCost.data_handle(), clusterCost.data_handle(), 1, raft::comms::op_t::SUM, stream);
+
+  DataT psi = 0;
+  raft::copy(&psi, clusterCost.data_handle(), 1, stream);
+
+  // <<< End of Step-2 >>>
+
+  ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+         "An error occurred in the distributed operation. This can result from "
+         "a failed rank");
+
+  // Scalable kmeans++ paper claims 8 rounds is sufficient
+  int niter = std::min(8, (int)ceil(log(psi)));
+  CUVS_LOG_KMEANS(handle,
+                  "@Rank-%d:KMeans|| :phi - %f, max # of iterations for kmeans++ loop - "
+                  "%d\n",
+                  my_rank,
+                  psi,
+                  niter);
+
+  // <<<< Step-3 >>> : for O( log(psi) ) times do
+  for (int iter = 0; iter < niter; ++iter) {
+    CUVS_LOG_KMEANS(handle,
+                    "@Rank-%d:KMeans|| - Iteration %d: # potential centroids sampled - "
+                    "%d\n",
+                    my_rank,
+                    iter,
+                    potentialCentroids.extent(0));
+
+    cuvs::cluster::kmeans::min_cluster_distance(handle,
+                                                X,
+                                                potentialCentroids,
+                                                minClusterDistance.view(),
+                                                L2NormX.view(),
+                                                L2NormBuf_OR_DistBuf,
+                                                params.metric,
+                                                params.batch_samples,
+                                                params.batch_centroids,
+                                                workspace);
+
+    cuvs::cluster::kmeans::cluster_cost(
+      handle,
+      minClusterDistance.view(),
+      workspace,
+      clusterCost.view(),
+      cuda::proclaim_return_type<DataT>(
+        [] __device__(const DataT& a, const DataT& b) { return a + b; }));
+    comm.allreduce(
+      clusterCost.data_handle(), clusterCost.data_handle(), 1, raft::comms::op_t::SUM, stream);
+    raft::copy(&psi, clusterCost.data_handle(), 1, stream);
+    ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+           "An error occurred in the distributed operation. This can result "
+           "from a failed rank");
+
+    // <<<< Step-4 >>> : Sample each point x in X independently and identify new
+    // potentialCentroids
+    raft::random::uniform(
+      handle, rng, uniformRands.data_handle(), uniformRands.extent(0), (DataT)0, (DataT)1);
+    cuvs::cluster::kmeans::SamplingOp<DataT, IndexT> select_op(psi,
+                                                               params.oversampling_factor,
+                                                               n_clusters,
+                                                               uniformRands.data_handle(),
+                                                               isSampleCentroid.data_handle());
+
+    rmm::device_uvector<DataT> inRankCp(0, stream);
+    cuvs::cluster::kmeans::sample_centroids(handle,
+                                            X,
+                                            minClusterDistance.view(),
+                                            isSampleCentroid.view(),
+                                            select_op,
+                                            inRankCp,
+                                            workspace);
+    /// <<<< End of Step-4 >>>>
+
+    int* nPtsSampledByRank;
+    RAFT_CUDA_TRY(cudaMallocHost(&nPtsSampledByRank, n_rank * sizeof(int)));
+
+    /// <<<< Step-5 >>> : C = C U C'
+    // append the data in Cp from all ranks to the buffer holding the
+    // potentialCentroids
+    // RAFT_CUDA_TRY(cudaMemsetAsync(nPtsSampledByRank, 0, n_rank * sizeof(int), stream));
+    std::fill(nPtsSampledByRank, nPtsSampledByRank + n_rank, 0);
+    nPtsSampledByRank[my_rank] = inRankCp.size() / n_features;
+    comm.allgather(&(nPtsSampledByRank[my_rank]), nPtsSampledByRank, 1, stream);
+    ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+           "An error occurred in the distributed operation. This can result "
+           "from a failed rank");
+
+    auto nPtsSampled =
+      thrust::reduce(thrust::host, nPtsSampledByRank, nPtsSampledByRank + n_rank, 0);
+
+    // gather centroids from all ranks
+    std::vector<size_t> sizes(n_rank);
+    thrust::transform(
+      thrust::host, nPtsSampledByRank, nPtsSampledByRank + n_rank, sizes.begin(), [&](int val) {
+        return val * n_features;
+      });
+
+    RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(nPtsSampledByRank));
+
+    std::vector<size_t> displs(n_rank);
+    thrust::exclusive_scan(thrust::host, sizes.begin(), sizes.end(), displs.begin());
+
+    centroidsBuf.resize(centroidsBuf.size() + nPtsSampled * n_features, stream);
+    comm.allgatherv<DataT>(inRankCp.data(),
+                           centroidsBuf.end() - nPtsSampled * n_features,
+                           sizes.data(),
+                           displs.data(),
+                           stream);
+
+    auto tot_centroids = potentialCentroids.extent(0) + nPtsSampled;
+    potentialCentroids =
+      raft::make_device_matrix_view<DataT, IndexT>(centroidsBuf.data(), tot_centroids, n_features);
+    /// <<<< End of Step-5 >>>
+  }  /// <<<< Step-6 >>>
+
+  CUVS_LOG_KMEANS(handle,
+                  "@Rank-%d:KMeans||: # potential centroids sampled - %d\n",
+                  my_rank,
+                  potentialCentroids.extent(0));
+
+  if ((IndexT)potentialCentroids.extent(0) > (IndexT)n_clusters) {
+    // <<< Step-7 >>>: For x in C, set w_x to be the number of pts closest to X
+    // temporary buffer to store the sample count per cluster, destructor
+    // releases the resource
+
+    auto weight = raft::make_device_vector<DataT, IndexT>(handle, potentialCentroids.extent(0));
+
+    cuvs::cluster::kmeans::count_samples_in_cluster(
+      handle, params, X, L2NormX.view(), potentialCentroids, workspace, weight.view());
+
+    // merge the local histogram from all ranks
+    comm.allreduce<DataT>(weight.data_handle(),  // sendbuff
+                          weight.data_handle(),  // recvbuff
+                          weight.size(),         // count
+                          raft::comms::op_t::SUM,
+                          stream);
+
+    // <<< end of Step-7 >>>
+
+    // Step-8: Recluster the weighted points in C into k clusters
+    // Note - reclustering step is duplicated across all ranks and with the same
+    // seed they should generate the same potentialCentroids
+    auto const_centroids = raft::make_device_matrix_view<const DataT, IndexT>(
+      potentialCentroids.data_handle(), potentialCentroids.extent(0), potentialCentroids.extent(1));
+    cuvs::cluster::kmeans::init_plus_plus(
+      handle, params, const_centroids, centroidsRawData, workspace);
+
+    auto inertia = raft::make_host_scalar<DataT>(0);
+    auto n_iter  = raft::make_host_scalar<IndexT>(0);
+    auto weight_view =
+      raft::make_device_vector_view<const DataT, IndexT>(weight.data_handle(), weight.extent(0));
+    cuvs::cluster::kmeans::params params_copy = params;
+    params_copy.rng_state                     = default_params.rng_state;
+
+    cuvs::cluster::kmeans::fit_main(handle,
+                                    params_copy,
+                                    const_centroids,
+                                    weight_view,
+                                    centroidsRawData,
+                                    inertia.view(),
+                                    n_iter.view(),
+                                    workspace);
+
+  } else if ((IndexT)potentialCentroids.extent(0) < (IndexT)n_clusters) {
+    // supplement with random
+    auto n_random_clusters = n_clusters - potentialCentroids.extent(0);
+    CUVS_LOG_KMEANS(handle,
+                    "[Warning!] KMeans||: found fewer than %d centroids during "
+                    "initialization (found %d centroids, remaining %d centroids will be "
+                    "chosen randomly from input samples)\n",
+                    n_clusters,
+                    potentialCentroids.extent(0),
+                    n_random_clusters);
+
+    // generate `n_random_clusters` centroids
+    cuvs::cluster::kmeans::params rand_params = params;
+    rand_params.rng_state                     = default_params.rng_state;
+    rand_params.init                          = cuvs::cluster::kmeans::params::InitMethod::Random;
+    rand_params.n_clusters                    = n_random_clusters;
+    initRandom(handle, rand_params, X, centroidsRawData);
+
+    // copy centroids generated during kmeans|| iteration to the buffer
+    raft::copy(centroidsRawData.data_handle() + n_random_clusters * n_features,
+               potentialCentroids.data_handle(),
+               potentialCentroids.size(),
+               stream);
+
+  } else {
+    // found the required n_clusters
+    raft::copy(centroidsRawData.data_handle(),
+               potentialCentroids.data_handle(),
+               potentialCentroids.size(),
+               stream);
+  }
+}
+
+template <typename DataT, typename IndexT>
+void checkWeights(const raft::resources& handle,
+                  rmm::device_uvector<char>& workspace,
+                  raft::device_vector_view<DataT, IndexT> weight)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  rmm::device_scalar<DataT> wt_aggr(stream);
+
+  const auto& comm = raft::resource::get_comms(handle);
+
+  auto n_samples            = weight.extent(0);
+  size_t temp_storage_bytes = 0;
+  RAFT_CUDA_TRY(cub::DeviceReduce::Sum(
+    nullptr, temp_storage_bytes, weight.data_handle(), wt_aggr.data(), n_samples, stream));
+
+  workspace.resize(temp_storage_bytes, stream);
+
+  RAFT_CUDA_TRY(cub::DeviceReduce::Sum(
+    workspace.data(), temp_storage_bytes, weight.data_handle(), wt_aggr.data(), n_samples, stream));
+
+  comm.allreduce<DataT>(wt_aggr.data(),  // sendbuff
+                        wt_aggr.data(),  // recvbuff
+                        1,               // count
+                        raft::comms::op_t::SUM,
+                        stream);
+  DataT wt_sum = wt_aggr.value(stream);
+  raft::resource::sync_stream(handle, stream);
+
+  if (wt_sum != n_samples) {
+    CUVS_LOG_KMEANS(handle,
+                    "[Warning!] KMeans: normalizing the user provided sample weights to "
+                    "sum up to %d samples",
+                    n_samples);
+
+    DataT scale = n_samples / wt_sum;
+    raft::linalg::unaryOp(
+      weight.data_handle(),
+      weight.data_handle(),
+      weight.size(),
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT& wt) { return wt * scale; }),
+      stream);
+  }
+}
+
+template <typename DataT, typename IndexT>
+void fit(const raft::resources& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const DataT, IndexT> X,
+         std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+         raft::device_matrix_view<DataT, IndexT> centroids,
+         raft::host_scalar_view<DataT> inertia,
+         raft::host_scalar_view<IndexT> n_iter,
+         rmm::device_uvector<char>& workspace)
+{
+  const auto& comm    = raft::resource::get_comms(handle);
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  auto n_samples      = X.extent(0);
+  auto n_features     = X.extent(1);
+  auto n_clusters     = params.n_clusters;
+  auto metric         = params.metric;
+
+  auto weight = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
+  if (sample_weight) {
+    raft::copy(weight.data_handle(), sample_weight->data_handle(), n_samples, stream);
+  } else {
+    thrust::fill(raft::resource::get_thrust_policy(handle),
+                 weight.data_handle(),
+                 weight.data_handle() + weight.size(),
+                 1);
+  }
+
+  // check if weights sum up to n_samples
+  checkWeights(handle, workspace, weight.view());
+
+  if (params.init == cuvs::cluster::kmeans::params::InitMethod::Random) {
+    // initializing with random samples from input dataset
+    CUVS_LOG_KMEANS(handle,
+                    "KMeans.fit: initialize cluster centers by randomly choosing from the "
+                    "input data.\n");
+    initRandom<DataT, IndexT>(handle, params, X, centroids);
+  } else if (params.init == cuvs::cluster::kmeans::params::InitMethod::KMeansPlusPlus) {
+    // default method to initialize is kmeans++
+    CUVS_LOG_KMEANS(handle, "KMeans.fit: initialize cluster centers using k-means++ algorithm.\n");
+    initKMeansPlusPlus<DataT, IndexT>(handle, params, X, centroids, workspace);
+  } else if (params.init == cuvs::cluster::kmeans::params::InitMethod::Array) {
+    CUVS_LOG_KMEANS(handle,
+                    "KMeans.fit: initialize cluster centers from the ndarray array input "
+                    "passed to init argument.\n");
+
+  } else {
+    THROW("unknown initialization method to select initial centers");
+  }
+
+  // stores (key, value) pair corresponding to each sample where
+  //   - key is the index of nearest cluster
+  //   - value is the distance to the nearest cluster
+  auto minClusterAndDistance =
+    raft::make_device_vector<raft::KeyValuePair<IndexT, DataT>, IndexT>(handle, n_samples);
+
+  // temporary buffer to store L2 norm of centroids or distance matrix,
+  // destructor releases the resource
+  rmm::device_uvector<DataT> L2NormBuf_OR_DistBuf(0, stream);
+
+  // temporary buffer to store intermediate centroids, destructor releases the
+  // resource
+  auto newCentroids = raft::make_device_matrix<DataT, IndexT>(handle, n_clusters, n_features);
+
+  // temporary buffer to store the weights per cluster, destructor releases
+  // the resource
+  auto wtInCluster = raft::make_device_vector<DataT, IndexT>(handle, n_clusters);
+
+  // L2 norm of X: ||x||^2
+  auto L2NormX = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
+  if (metric == cuvs::distance::DistanceType::L2Expanded ||
+      metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
+    raft::linalg::rowNorm(L2NormX.data_handle(),
+                          X.data_handle(),
+                          X.extent(1),
+                          X.extent(0),
+                          raft::linalg::L2Norm,
+                          true,
+                          stream);
+  }
+
+  DataT priorClusteringCost = 0;
+  for (n_iter[0] = 1; n_iter[0] <= params.max_iter; ++n_iter[0]) {
+    CUVS_LOG_KMEANS(handle,
+                    "KMeans.fit: Iteration-%d: fitting the model using the initialize "
+                    "cluster centers\n",
+                    n_iter[0]);
+
+    auto const_centroids = raft::make_device_matrix_view<const DataT, IndexT>(
+      centroids.data_handle(), centroids.extent(0), centroids.extent(1));
+    // computes minClusterAndDistance[0:n_samples) where
+    // minClusterAndDistance[i] is a <key, value> pair where
+    //   'key' is index to an sample in 'centroids' (index of the nearest
+    //   centroid) and 'value' is the distance between the sample 'X[i]' and the
+    //   'centroid[key]'
+    cuvs::cluster::kmeans::min_cluster_and_distance(handle,
+                                                    X,
+                                                    const_centroids,
+                                                    minClusterAndDistance.view(),
+                                                    L2NormX.view(),
+                                                    L2NormBuf_OR_DistBuf,
+                                                    params.metric,
+                                                    params.batch_samples,
+                                                    params.batch_centroids,
+                                                    workspace);
+
+    // Using TransformInputIteratorT to dereference an array of
+    // cub::KeyValuePair and converting them to just return the Key to be used
+    // in reduce_rows_by_key prims
+    KeyValueIndexOp<IndexT, DataT> conversion_op;
+    cub::TransformInputIterator<IndexT,
+                                KeyValueIndexOp<IndexT, DataT>,
+                                raft::KeyValuePair<IndexT, DataT>*>
+      itr(minClusterAndDistance.data_handle(), conversion_op);
+
+    workspace.resize(n_samples, stream);
+
+    // Calculates weighted sum of all the samples assigned to cluster-i and
+    // store the result in newCentroids[i]
+    raft::linalg::reduce_rows_by_key((DataT*)X.data_handle(),
+                                     X.extent(1),
+                                     itr,
+                                     weight.data_handle(),
+                                     workspace.data(),
+                                     X.extent(0),
+                                     X.extent(1),
+                                     static_cast<IndexT>(n_clusters),
+                                     newCentroids.data_handle(),
+                                     stream);
+
+    // Reduce weights by key to compute weight in each cluster
+    raft::linalg::reduce_cols_by_key(weight.data_handle(),
+                                     itr,
+                                     wtInCluster.data_handle(),
+                                     (IndexT)1,
+                                     (IndexT)weight.extent(0),
+                                     (IndexT)n_clusters,
+                                     stream);
+
+    // merge the local histogram from all ranks
+    comm.allreduce<DataT>(wtInCluster.data_handle(),  // sendbuff
+                          wtInCluster.data_handle(),  // recvbuff
+                          wtInCluster.size(),         // count
+                          raft::comms::op_t::SUM,
+                          stream);
+
+    // reduces newCentroids from all ranks
+    comm.allreduce<DataT>(newCentroids.data_handle(),  // sendbuff
+                          newCentroids.data_handle(),  // recvbuff
+                          newCentroids.size(),         // count
+                          raft::comms::op_t::SUM,
+                          stream);
+
+    // Computes newCentroids[i] = newCentroids[i]/wtInCluster[i] where
+    //   newCentroids[n_clusters x n_features] - 2D array, newCentroids[i] has
+    //   sum of all the samples assigned to cluster-i
+    //   wtInCluster[n_clusters] - 1D array, wtInCluster[i] contains # of
+    //   samples in cluster-i.
+    // Note - when wtInCluster[i] is 0, newCentroid[i] is reset to 0
+
+    raft::linalg::matrixVectorOp(
+      newCentroids.data_handle(),
+      newCentroids.data_handle(),
+      wtInCluster.data_handle(),
+      newCentroids.extent(1),
+      newCentroids.extent(0),
+      true,
+      false,
+      cuda::proclaim_return_type<DataT>([=] __device__(DataT mat, DataT vec) {
+        if (vec == 0)
+          return DataT(0);
+        else
+          return mat / vec;
+      }),
+      stream);
+
+    // copy the centroids[i] to newCentroids[i] when wtInCluster[i] is 0
+    cub::ArgIndexInputIterator<DataT*> itr_wt(wtInCluster.data_handle());
+    raft::matrix::gather_if(
+      centroids.data_handle(),
+      centroids.extent(1),
+      centroids.extent(0),
+      itr_wt,
+      itr_wt,
+      wtInCluster.extent(0),
+      newCentroids.data_handle(),
+      cuda::proclaim_return_type<bool>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // predicate
+          // copy when the # of samples in the cluster is 0
+          if (map.value == 0)
+            return true;
+          else
+            return false;
+        }),
+      cuda::proclaim_return_type<ptrdiff_t>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
+          return map.key;
+        }),
+      stream);
+
+    // compute the squared norm between the newCentroids and the original
+    // centroids, destructor releases the resource
+    auto sqrdNorm = raft::make_device_scalar<DataT>(handle, 1);
+    raft::linalg::mapThenSumReduce(
+      sqrdNorm.data_handle(),
+      newCentroids.size(),
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT a, const DataT b) {
+        DataT diff = a - b;
+        return diff * diff;
+      }),
+      stream,
+      centroids.data_handle(),
+      newCentroids.data_handle());
+
+    DataT sqrdNormError = 0;
+    raft::copy(&sqrdNormError, sqrdNorm.data_handle(), sqrdNorm.size(), stream);
+
+    raft::copy(centroids.data_handle(), newCentroids.data_handle(), newCentroids.size(), stream);
+
+    bool done = false;
+    if (params.inertia_check) {
+      rmm::device_scalar<raft::KeyValuePair<IndexT, DataT>> clusterCostD(stream);
+
+      // calculate cluster cost phi_x(C)
+      cuvs::cluster::kmeans::cluster_cost(
+        handle,
+        minClusterAndDistance.view(),
+        workspace,
+        raft::make_device_scalar_view(clusterCostD.data()),
+        cuda::proclaim_return_type<raft::KeyValuePair<IndexT, DataT>>(
+          [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
+                        const raft::KeyValuePair<IndexT, DataT>& b) {
+            raft::KeyValuePair<IndexT, DataT> res;
+            res.key   = 0;
+            res.value = a.value + b.value;
+            return res;
+          }));
+
+      // Cluster cost phi_x(C) from all ranks
+      comm.allreduce(&(clusterCostD.data()->value),
+                     &(clusterCostD.data()->value),
+                     1,
+                     raft::comms::op_t::SUM,
+                     stream);
+
+      DataT curClusteringCost = 0;
+      raft::copy(&curClusteringCost, &(clusterCostD.data()->value), 1, stream);
+
+      ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+             "An error occurred in the distributed operation. This can result "
+             "from a failed rank");
+      ASSERT(curClusteringCost != (DataT)0.0,
+             "Too few points and centroids being found is getting 0 cost from "
+             "centers\n");
+
+      if (n_iter[0] > 0) {
+        DataT delta = curClusteringCost / priorClusteringCost;
+        if (delta > 1 - params.tol) done = true;
+      }
+      priorClusteringCost = curClusteringCost;
+    }
+
+    raft::resource::sync_stream(handle, stream);
+    if (sqrdNormError < params.tol) done = true;
+
+    if (done) {
+      CUVS_LOG_KMEANS(
+        handle, "Threshold triggered after %d iterations. Terminating early.\n", n_iter[0]);
+      break;
+    }
+  }
+}
+
+};  // namespace cuvs::cluster::kmeans::mg::detail
diff --git a/cpp/src/cluster/kmeans.cuh b/cpp/src/cluster/kmeans.cuh
index 1d12142da..5e6d756cc 100644
--- a/cpp/src/cluster/kmeans.cuh
+++ b/cpp/src/cluster/kmeans.cuh
@@ -17,10 +17,12 @@
 
 #include "detail/kmeans.cuh"
 #include "detail/kmeans_auto_find_k.cuh"
+#include "kmeans_mg.hpp"
 #include <cuvs/cluster/kmeans.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/comms.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include <optional>
@@ -94,8 +96,13 @@ void fit(raft::resources const& handle,
          raft::host_scalar_view<DataT> inertia,
          raft::host_scalar_view<IndexT> n_iter)
 {
-  cuvs::cluster::kmeans::detail::kmeans_fit<DataT, IndexT>(
-    handle, params, X, sample_weight, centroids, inertia, n_iter);
+  // use the mnmg kmeans fit if we have comms initialize, single gpu otherwise
+  if (raft::resource::comms_initialized(handle)) {
+    cuvs::cluster::kmeans::mg::fit(handle, params, X, sample_weight, centroids, inertia, n_iter);
+  } else {
+    cuvs::cluster::kmeans::detail::kmeans_fit<DataT, IndexT>(
+      handle, params, X, sample_weight, centroids, inertia, n_iter);
+  }
 }
 
 /**
diff --git a/cpp/src/cluster/kmeans_fit_double.cu b/cpp/src/cluster/kmeans_fit_double.cu
new file mode 100644
index 000000000..4f193da09
--- /dev/null
+++ b/cpp/src/cluster/kmeans_fit_double.cu
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kmeans.cuh"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans {
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter)
+{
+  cuvs::cluster::kmeans::fit<double, int>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int64_t> X,
+         std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter)
+{
+  cuvs::cluster::kmeans::fit<double, int64_t>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
+}  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_fit_float.cu b/cpp/src/cluster/kmeans_fit_float.cu
index 89862a46c..3888ae492 100644
--- a/cpp/src/cluster/kmeans_fit_float.cu
+++ b/cpp/src/cluster/kmeans_fit_float.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,4 +30,16 @@ void fit(raft::resources const& handle,
   cuvs::cluster::kmeans::fit<float, int>(
     handle, params, X, sample_weight, centroids, inertia, n_iter);
 }
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int64_t> X,
+         std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter)
+{
+  cuvs::cluster::kmeans::fit<float, int64_t>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
 }  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_fit_mg_double.cu b/cpp/src/cluster/kmeans_fit_mg_double.cu
new file mode 100644
index 000000000..15081dfba
--- /dev/null
+++ b/cpp/src/cluster/kmeans_fit_mg_double.cu
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./detail/kmeans_mg.cuh"
+#include "kmeans_mg.hpp"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans::mg {
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter)
+{
+  rmm::device_uvector<char> workspace(0, raft::resource::get_cuda_stream(handle));
+
+  cuvs::cluster::kmeans::mg::detail::fit<double, int>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter, workspace);
+}
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int64_t> X,
+         std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter)
+{
+  rmm::device_uvector<char> workspace(0, raft::resource::get_cuda_stream(handle));
+
+  cuvs::cluster::kmeans::mg::detail::fit<double, int64_t>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter, workspace);
+}
+}  // namespace cuvs::cluster::kmeans::mg
diff --git a/cpp/src/cluster/kmeans_fit_mg_float.cu b/cpp/src/cluster/kmeans_fit_mg_float.cu
new file mode 100644
index 000000000..54fbd6763
--- /dev/null
+++ b/cpp/src/cluster/kmeans_fit_mg_float.cu
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./detail/kmeans_mg.cuh"
+#include "kmeans_mg.hpp"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans::mg {
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int> X,
+         std::optional<raft::device_vector_view<const float, int>> sample_weight,
+         raft::device_matrix_view<float, int> centroids,
+         raft::host_scalar_view<float, int> inertia,
+         raft::host_scalar_view<int, int> n_iter)
+{
+  rmm::device_uvector<char> workspace(0, raft::resource::get_cuda_stream(handle));
+
+  cuvs::cluster::kmeans::mg::detail::fit<float, int>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter, workspace);
+}
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int64_t> X,
+         std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter)
+{
+  rmm::device_uvector<char> workspace(0, raft::resource::get_cuda_stream(handle));
+
+  cuvs::cluster::kmeans::mg::detail::fit<float, int64_t>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter, workspace);
+}
+}  // namespace cuvs::cluster::kmeans::mg
diff --git a/cpp/src/cluster/kmeans_fit_predict_double.cu b/cpp/src/cluster/kmeans_fit_predict_double.cu
new file mode 100644
index 000000000..28a1d70c0
--- /dev/null
+++ b/cpp/src/cluster/kmeans_fit_predict_double.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kmeans.cuh"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans {
+
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const double, int> X,
+                 std::optional<raft::device_vector_view<const double, int>> sample_weight,
+                 std::optional<raft::device_matrix_view<double, int>> centroids,
+                 raft::device_vector_view<int, int> labels,
+                 raft::host_scalar_view<double> inertia,
+                 raft::host_scalar_view<int> n_iter)
+
+{
+  cuvs::cluster::kmeans::fit_predict<double, int>(
+    handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
+}
+
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const double, int64_t> X,
+                 std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+                 std::optional<raft::device_matrix_view<double, int64_t>> centroids,
+                 raft::device_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<double> inertia,
+                 raft::host_scalar_view<int64_t> n_iter)
+
+{
+  cuvs::cluster::kmeans::fit_predict<double, int64_t>(
+    handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
+}
+}  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_fit_predict_float.cu b/cpp/src/cluster/kmeans_fit_predict_float.cu
index f043f7624..be3652db5 100644
--- a/cpp/src/cluster/kmeans_fit_predict_float.cu
+++ b/cpp/src/cluster/kmeans_fit_predict_float.cu
@@ -32,4 +32,18 @@ void fit_predict(raft::resources const& handle,
   cuvs::cluster::kmeans::fit_predict<float, int>(
     handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
 }
+
+void fit_predict(raft::resources const& handle,
+                 const kmeans::params& params,
+                 raft::device_matrix_view<const float, int64_t> X,
+                 std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+                 std::optional<raft::device_matrix_view<float, int64_t>> centroids,
+                 raft::device_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<float> inertia,
+                 raft::host_scalar_view<int64_t> n_iter)
+
+{
+  cuvs::cluster::kmeans::fit_predict<float, int64_t>(
+    handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
+}
 }  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_mg.hpp b/cpp/src/cluster/kmeans_mg.hpp
new file mode 100644
index 000000000..34f38314a
--- /dev/null
+++ b/cpp/src/cluster/kmeans_mg.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <cuvs/cluster/kmeans.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans::mg {
+
+/**
+ * @brief MNMG kmeans fit
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int> X,
+         std::optional<raft::device_vector_view<const float, int>> sample_weight,
+         raft::device_matrix_view<float, int> centroids,
+         raft::host_scalar_view<float, int> inertia,
+         raft::host_scalar_view<int, int> n_iter);
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const float, int64_t> X,
+         std::optional<raft::device_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter);
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter);
+
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::device_matrix_view<const double, int64_t> X,
+         std::optional<raft::device_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double, int64_t> inertia,
+         raft::host_scalar_view<int64_t, int64_t> n_iter);
+}  // namespace cuvs::cluster::kmeans::mg
diff --git a/cpp/src/cluster/kmeans_predict_double.cu b/cpp/src/cluster/kmeans_predict_double.cu
new file mode 100644
index 000000000..1fcc393ac
--- /dev/null
+++ b/cpp/src/cluster/kmeans_predict_double.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kmeans.cuh"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans {
+
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const double, int> X,
+             std::optional<raft::device_vector_view<const double, int>> sample_weight,
+             raft::device_matrix_view<const double, int> centroids,
+             raft::device_vector_view<int, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<double> inertia)
+
+{
+  cuvs::cluster::kmeans::predict<double, int>(
+    handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+}
+
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const double, int> X,
+             std::optional<raft::device_vector_view<const double, int>> sample_weight,
+             raft::device_matrix_view<const double, int> centroids,
+             raft::device_vector_view<int64_t, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<double> inertia)
+
+{
+  cuvs::cluster::kmeans::predict<double, int64_t>(
+    handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+}
+}  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_predict_float.cu b/cpp/src/cluster/kmeans_predict_float.cu
index d092152f1..b5f9f9e51 100644
--- a/cpp/src/cluster/kmeans_predict_float.cu
+++ b/cpp/src/cluster/kmeans_predict_float.cu
@@ -32,4 +32,17 @@ void predict(raft::resources const& handle,
   cuvs::cluster::kmeans::predict<float, int>(
     handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
 }
+void predict(raft::resources const& handle,
+             const kmeans::params& params,
+             raft::device_matrix_view<const float, int> X,
+             std::optional<raft::device_vector_view<const float, int>> sample_weight,
+             raft::device_matrix_view<const float, int> centroids,
+             raft::device_vector_view<int64_t, int> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<float> inertia)
+
+{
+  cuvs::cluster::kmeans::predict<float, int64_t>(
+    handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+}
 }  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/cluster/kmeans_transform_double.cu b/cpp/src/cluster/kmeans_transform_double.cu
new file mode 100644
index 000000000..4a026812e
--- /dev/null
+++ b/cpp/src/cluster/kmeans_transform_double.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kmeans.cuh"
+#include <raft/core/resources.hpp>
+
+namespace cuvs::cluster::kmeans {
+
+void transform(raft::resources const& handle,
+               const kmeans::params& params,
+               raft::device_matrix_view<const double, int> X,
+               raft::device_matrix_view<const double, int> centroids,
+               raft::device_matrix_view<double, int> X_new)
+
+{
+  cuvs::cluster::kmeans::transform<double, int>(handle, params, X, centroids, X_new);
+}
+}  // namespace cuvs::cluster::kmeans
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
index 3107f0fa4..4fd194f6c 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -111,6 +111,8 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::l1_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::l2_exp_distance_op, int);
+instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
+  cuvs::distance::detail::ops::l2_exp_distance_op, int64_t);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::l2_unexp_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
@@ -124,5 +126,8 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo(
   int64_t,
   cuvs::distance::kernels::detail::rbf_fin_op);
 
+instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
+  cuvs::distance::detail::ops::l2_exp_distance_op, int64_t);
+
 #undef instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo
 #undef instantiate_cuvs_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 1bd51aef9..d0913833f 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -15,7 +15,7 @@
 # NOTE: this template is not perfectly formatted. Use pre-commit to get
 # everything in shape again.
 header = """/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -197,4 +197,30 @@ def arch_headers(archs):
 
         f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
 
-print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
+
+        print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
+
+# L2 with int64_t indices for kmeans code
+int64_t_op_instances = [
+    dict(
+        path_prefix="l2_expanded",
+        OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+        archs = [60, 80],
+    )]
+
+for op in int64_t_op_instances:
+    for dt in data_type_instances:
+        DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+
+        IdxT = "int64_t"
+        path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
+        with open(path, "w") as f:
+            f.write(header)
+            f.write(arch_headers(op["archs"]))
+            f.write(macro)
+
+            OpT = op['OpT']
+            FinOpT = "raft::identity_op"
+            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+            f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+        print(f"src/distance/detail/pairwise_matrix/{path}")
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu
new file mode 100644
index 000000000..756739158
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include "../distance_ops/all_ops.cuh"  // ops::*
+#include "dispatch-inl.cuh"             // dispatch
+#include "dispatch_sm60.cuh"
+#include "dispatch_sm80.cuh"
+#include <raft/core/operators.hpp>  // raft::identity_op
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void cuvs::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const OutT* x_norm,                                                              \
+      const OutT* y_norm,                                                              \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  cuvs::distance::detail::ops::l2_exp_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int64_t);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu
new file mode 100644
index 000000000..94910875c
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include "../distance_ops/all_ops.cuh"  // ops::*
+#include "dispatch-inl.cuh"             // dispatch
+#include "dispatch_sm60.cuh"
+#include "dispatch_sm80.cuh"
+#include <raft/core/operators.hpp>  // raft::identity_op
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void cuvs::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const OutT* x_norm,                                                              \
+      const OutT* y_norm,                                                              \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  cuvs::distance::detail::ops::l2_exp_distance_op, float, float, float, raft::identity_op, int64_t);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
index 1cb0ed8ae..3c8f25109 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh
index e7fa30f03..8ce7ef690 100644
--- a/cpp/src/distance/distance-ext.cuh
+++ b/cpp/src/distance/distance-ext.cuh
@@ -244,6 +244,15 @@ instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::Linf);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::LpUnexpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::RusselRaoExpanded);
 
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2Expanded, float, float, float, int64_t);
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2Expanded, double, double, double, int64_t);
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, int64_t);
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, int64_t);
+
 #undef instantiate_cuvs_distance_distance_by_algo
 #undef instantiate_cuvs_distance_distance
 
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index 72be93f10..c1d39f360 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -105,6 +105,16 @@ instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::Linf);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::LpUnexpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::RusselRaoExpanded);
 
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2Expanded, float, float, float, int64_t);
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2Expanded, double, double, double, int64_t);
+
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, int64_t);
+instantiate_cuvs_distance_distance(
+  cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, int64_t);
+
 #undef instantiate_cuvs_distance_distance_by_algo
 #undef instantiate_cuvs_distance_distance
 
diff --git a/cpp/test/cluster/kmeans_mg.cu b/cpp/test/cluster/kmeans_mg.cu
new file mode 100644
index 000000000..b9e06b2f1
--- /dev/null
+++ b/cpp/test/cluster/kmeans_mg.cu
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <raft/stats/adjusted_rand_index.cuh>
+
+#include <raft/comms/std_comms.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/fill.h>
+
+#include <gtest/gtest.h>
+#include <nccl.h>
+#include <stdio.h>
+#include <test_utils.h>
+
+#include <vector>
+
+#define NCCLCHECK(cmd)                                                                        \
+  do {                                                                                        \
+    ncclResult_t res = cmd;                                                                   \
+    if (res != ncclSuccess) {                                                                 \
+      printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(res)); \
+      exit(EXIT_FAILURE);                                                                     \
+    }                                                                                         \
+  } while (0)
+
+namespace cuvs {
+
+template <typename T>
+struct KmeansInputs {
+  int n_row;
+  int n_col;
+  int n_clusters;
+  T tol;
+  bool weighted;
+};
+
+template <typename T>
+class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
+ protected:
+  KmeansTest()
+    : stream(handle.get_stream()),
+      d_labels(0, stream),
+      d_labels_ref(0, stream),
+      d_centroids(0, stream),
+      d_sample_weight(0, stream)
+  {
+  }
+
+  void basicTest()
+  {
+    testparams = ::testing::TestWithParam<KmeansInputs<T>>::GetParam();
+    ncclComm_t nccl_comm;
+    NCCLCHECK(ncclCommInitAll(&nccl_comm, 1, {0}));
+    raft::comms::build_comms_nccl_only(&handle, nccl_comm, 1, 0);
+
+    int n_samples              = testparams.n_row;
+    int n_features             = testparams.n_col;
+    params.n_clusters          = testparams.n_clusters;
+    params.tol                 = testparams.tol;
+    params.n_init              = 5;
+    params.rng_state.seed      = 1;
+    params.oversampling_factor = 1;
+
+    auto stream = handle.get_stream();
+    rmm::device_uvector<T> X(n_samples * n_features, stream);
+    rmm::device_uvector<int> labels(n_samples, stream);
+
+    raft::random::make_blobs<T, int>(handle,
+                                     X.data(),
+                                     labels.data(),
+                                     n_samples,
+                                     n_features,
+                                     params.n_clusters,
+                                     true,
+                                     nullptr,
+                                     nullptr,
+                                     1.0,
+                                     false,
+                                     -10.0f,
+                                     10.0f,
+                                     1234ULL);
+
+    d_labels.resize(n_samples, stream);
+    d_labels_ref.resize(n_samples, stream);
+    d_centroids.resize(params.n_clusters * n_features, stream);
+
+    std::optional<raft::device_vector_view<const T, int>> d_sw = std::nullopt;
+    if (testparams.weighted) {
+      d_sample_weight.resize(n_samples, stream);
+      thrust::fill(thrust::cuda::par.on(stream),
+                   d_sample_weight.data(),
+                   d_sample_weight.data() + n_samples,
+                   1);
+      d_sw = raft::make_device_vector_view<const T, int>(d_sample_weight.data(), n_samples);
+    }
+    raft::copy(d_labels_ref.data(), labels.data(), n_samples, stream);
+
+    handle.sync_stream(stream);
+
+    T inertia  = 0;
+    int n_iter = 0;
+
+    auto X_view = raft::make_device_matrix_view<const T, int>(X.data(), n_samples, n_features);
+    auto centroids_view =
+      raft::make_device_matrix_view<T, int>(d_centroids.data(), params.n_clusters, n_features);
+
+    cuvs::cluster::kmeans::fit(handle,
+                               params,
+                               X_view,
+                               d_sw,
+                               centroids_view,
+                               raft::make_host_scalar_view<T>(&inertia),
+                               raft::make_host_scalar_view<int>(&n_iter));
+
+    cuvs::cluster::kmeans::predict(
+      handle,
+      params,
+      X_view,
+      d_sw,
+      d_centroids.data(),
+      raft::make_device_vector_view<int, int>(d_labels.data(), n_samples),
+      true,
+      raft::make_host_scalar_view<T>(&inertia));
+    score = raft::stats::adjusted_rand_index(
+      d_labels_ref.data(), d_labels.data(), n_samples, raft::resource::get_cuda_stream(handle));
+    handle.sync_stream(stream);
+
+    if (score < 0.99) {
+      std::cout << "Expected: " << raft::arr2Str(d_labels_ref.data(), 25, "d_labels_ref", stream)
+                << std::endl;
+      std::cout << "Actual: " << raft::arr2Str(d_labels.data(), 25, "d_labels", stream)
+                << std::endl;
+      std::cout << "score = " << score << std::endl;
+    }
+    ncclCommDestroy(nccl_comm);
+  }
+
+  void SetUp() override { basicTest(); }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+  KmeansInputs<T> testparams;
+  rmm::device_uvector<int> d_labels;
+  rmm::device_uvector<int> d_labels_ref;
+  rmm::device_uvector<T> d_centroids;
+  rmm::device_uvector<T> d_sample_weight;
+  double score;
+  cuvs::cluster::kmeans::params params;
+};
+
+const std::vector<KmeansInputs<float>> inputsf2 = {{1000, 32, 5, 0.0001, true},
+                                                   {1000, 32, 5, 0.0001, false},
+                                                   {1000, 100, 20, 0.0001, true},
+                                                   {1000, 100, 20, 0.0001, false},
+                                                   {10000, 32, 10, 0.0001, true},
+                                                   {10000, 32, 10, 0.0001, false},
+                                                   {10000, 100, 50, 0.0001, true},
+                                                   {10000, 100, 50, 0.0001, false}};
+
+const std::vector<KmeansInputs<double>> inputsd2 = {{1000, 32, 5, 0.0001, true},
+                                                    {1000, 32, 5, 0.0001, false},
+                                                    {1000, 100, 20, 0.0001, true},
+                                                    {1000, 100, 20, 0.0001, false},
+                                                    {10000, 32, 10, 0.0001, true},
+                                                    {10000, 32, 10, 0.0001, false},
+                                                    {10000, 100, 50, 0.0001, true},
+                                                    {10000, 100, 50, 0.0001, false}};
+
+typedef KmeansTest<float> KmeansTestF;
+TEST_P(KmeansTestF, Result) { ASSERT_TRUE(score >= 0.99); }
+
+typedef KmeansTest<double> KmeansTestD;
+TEST_P(KmeansTestD, Result) { ASSERT_TRUE(score >= 0.99); }
+
+INSTANTIATE_TEST_CASE_P(KmeansTests, KmeansTestF, ::testing::ValuesIn(inputsf2));
+
+INSTANTIATE_TEST_CASE_P(KmeansTests, KmeansTestD, ::testing::ValuesIn(inputsd2));
+
+}  // end namespace cuvs