Commit

Merge branch 'branch-24.04' into fix-cagra-prune-illegal-memory-access
achirkin authored Mar 15, 2024
2 parents 4f9f72d + 1877011 commit c773943
Showing 16 changed files with 104 additions and 70 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -1,8 +1,12 @@
# <div align="left"><img src="https://rapids.ai/assets/images/rapids_logo.png" width="90px"/>&nbsp;RAFT: Reusable Accelerated Functions and Tools for Vector Search and More</div>

> [!IMPORTANT]
> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.08 (August) release.
![RAFT tech stack](img/raft-tech-stack-vss.png)



## Contents
<hr>

@@ -77,6 +81,8 @@ Projects that use the RAFT ANNS algorithms for accelerating vector search includ

Please see the example [Jupyter notebook](https://github.com/rapidsai/raft/blob/HEAD/notebooks/VectorSearch_QuestionRetrieval.ipynb) to get started with RAFT for vector search in Python.



### Information Retrieval

RAFT contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -39,7 +39,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- nvcc_linux-aarch64=11.8
- pre-commit
@@ -57,5 +57,5 @@ dependencies:
- sysroot_linux-aarch64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-118_arch-aarch64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -39,7 +39,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- nvcc_linux-64=11.8
- pre-commit
@@ -57,5 +57,5 @@ dependencies:
- sysroot_linux-64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-118_arch-x86_64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -36,7 +36,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
@@ -53,5 +53,5 @@ dependencies:
- sysroot_linux-aarch64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-122_arch-aarch64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -36,7 +36,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
@@ -53,5 +53,5 @@ dependencies:
- sysroot_linux-64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-122_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/pylibraft/meta.yaml
@@ -65,7 +65,7 @@ requirements:
{% endif %}
- libraft {{ version }}
- libraft-headers {{ version }}
- numpy >=1.23
- numpy >=1.23,<2.0a0
- python x.x
- rmm ={{ minor_version }}

2 changes: 1 addition & 1 deletion conda/recipes/raft-dask/conda_build_config.yaml
@@ -14,7 +14,7 @@ sysroot_version:
- "2.17"

ucx_version:
- ">=1.14.1,<1.16.0"
- ">=1.15.0,<1.16.0"

ucx_py_version:
- "0.37.*"
119 changes: 66 additions & 53 deletions cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh
@@ -91,18 +91,11 @@ std::enable_if_t<ops::has_cutlass_op<OpT>::value> cutlassDistanceKernel(const Da

typename EpilogueOutputOp::Params epilog_op_param(dist_op, fin_op);

const DataT *a, *b;

IdxT gemm_lda, gemm_ldb;

// Number of pipelines you want to use
constexpr int NumStages = 3;
// Alignment
constexpr int Alignment = VecLen;

// default initialize problem size with row major inputs
auto problem_size = cutlass::gemm::GemmCoord(n, m, k);

using cutlassDistKernel =
typename cutlass::gemm::kernel::PairwiseDistanceGemm<DataT,
Alignment,
@@ -116,53 +109,73 @@ std::enable_if_t<ops::has_cutlass_op<OpT>::value> cutlassDistanceKernel(const Da

using cutlassDist = cutlass::gemm::device::GemmUniversalAdapter<cutlassDistKernel>;

if constexpr (isRowMajor) {
a = y;
b = x;
gemm_lda = ldb;
gemm_ldb = lda;
} else {
problem_size = cutlass::gemm::GemmCoord(m, n, k);
a = x;
b = y;
gemm_lda = lda;
gemm_ldb = ldb;
constexpr uint32_t gridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1);
constexpr uint32_t max_batch_size = gridYZMax * cutlassDistKernel::ThreadblockShape::kN;
IdxT numNbatches = (n - 1 + max_batch_size) / max_batch_size;

for (IdxT i = 0; i < numNbatches; i++) {
const DataT *a, *b;
IdxT gemm_lda, gemm_ldb;
size_t offsetN = i * max_batch_size;

if constexpr (isRowMajor) {
gemm_lda = ldb;
gemm_ldb = lda;
a = y + offsetN * gemm_lda;
b = x;
} else {
gemm_lda = lda;
gemm_ldb = ldb;
a = x;
b = y + offsetN;
}
IdxT chunkN = (i + 1) * max_batch_size;
IdxT currentN = (chunkN < n) ? max_batch_size : (n - offsetN);

// default initialize problem size with row major inputs
auto problem_size = isRowMajor ? cutlass::gemm::GemmCoord(currentN, m, k)
: cutlass::gemm::GemmCoord(m, currentN, k);

typename cutlassDist::Arguments arguments{
mode,
problem_size,
batch_count,
epilog_op_param,
a,
b,
xn, // C matrix eq vector param, which here is A norm
nullptr, // tensor_Z,
(DataT*)yn + offsetN, // this is broadcast vec, which is required to be non-const param
dOutput + offsetN, // Output distance matrix
(int64_t)0, // batch stride A
(int64_t)0, // batch stride B
(int64_t)0, // batch stride Norm A
(int64_t)0,
(int64_t)0, // batch stride Norm B
(int64_t)0, // batch stride Output
gemm_lda, // stride A
gemm_ldb, // stride B
1, // stride A norm
0, // this is no-op for Z
0, // This must be zero
ldd // stride Output matrix
};

// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = cutlassDist::get_workspace_size(arguments);
// Allocate workspace memory
rmm::device_uvector<uint8_t> workspace(workspace_size, stream);
// Instantiate CUTLASS kernel depending on templates
cutlassDist cutlassDist_op;
// Check the problem size is supported or not
RAFT_CUTLASS_TRY(cutlassDist_op.can_implement(arguments));

// Initialize CUTLASS kernel with arguments and workspace pointer
RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream));

// Launch initialized CUTLASS kernel
RAFT_CUTLASS_TRY(cutlassDist_op(stream));
}

typename cutlassDist::Arguments arguments{
mode, problem_size, batch_count, epilog_op_param, a, b,
xn, // C matrix eq vector param, which here is A norm
nullptr, // tensor_Z,
(DataT*)yn, // this is broadcast vec, which is required to be non-const param
dOutput, // Output distance matrix
(int64_t)0, // batch stride A
(int64_t)0, // batch stride B
(int64_t)0, // batch stride Norm A
(int64_t)0,
(int64_t)0, // batch stride Norm B
(int64_t)0, // batch stride Output
gemm_lda, // stride A
gemm_ldb, // stride B
1, // stride A norm
0, // this is no-op for Z
0, // This must be zero
ldd // stride Output matrix
};

// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = cutlassDist::get_workspace_size(arguments);
// Allocate workspace memory
rmm::device_uvector<uint8_t> workspace(workspace_size, stream);
// Instantiate CUTLASS kernel depending on templates
cutlassDist cutlassDist_op;
// Check the problem size is supported or not
RAFT_CUTLASS_TRY(cutlassDist_op.can_implement(arguments));

// Initialize CUTLASS kernel with arguments and workspace pointer
RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream));

// Launch initialized CUTLASS kernel
RAFT_CUTLASS_TRY(cutlassDist_op(stream));
}

}; // namespace detail
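
The rewritten kernel above batches the CUTLASS pairwise-distance GEMM along the n dimension, so that no single launch needs more threadblock tiles in that dimension than CUDA's 65535-block limit on grid.y/grid.z allows; each iteration offsets y, yn, and dOutput by offsetN. Below is a minimal, standalone sketch of the batching arithmetic only, assuming a tile width of kN = 128 as a stand-in for cutlassDistKernel::ThreadblockShape::kN (the helper program is illustrative, not part of RAFT).

```cpp
// Standalone sketch of the batching logic above (kN is an assumed placeholder).
#include <cstdint>
#include <cstdio>

int main()
{
  constexpr uint32_t gridYZMax      = (1u << 16) - 1;     // 65535: CUDA limit on grid.y / grid.z
  constexpr uint32_t kN             = 128;                // assumed ThreadblockShape::kN
  constexpr uint32_t max_batch_size = gridYZMax * kN;     // upper bound on columns handled per launch

  const int64_t n           = int64_t{65536 + 128} * 128; // matches the new large test size
  const int64_t numNbatches = (n - 1 + max_batch_size) / max_batch_size;

  for (int64_t i = 0; i < numNbatches; i++) {
    const int64_t offsetN  = i * max_batch_size;          // column offset into y, yn and dOutput
    const int64_t currentN = ((i + 1) * max_batch_size < n) ? max_batch_size : (n - offsetN);
    std::printf("batch %lld: offsetN = %lld, currentN = %lld\n",
                static_cast<long long>(i),
                static_cast<long long>(offsetN),
                static_cast<long long>(currentN));
  }
  return 0;
}
```

With these assumed constants and n = (65536 + 128) * 128, the loop issues two launches: one covering 8,388,480 columns and one covering the remaining 16,512.
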
4 changes: 3 additions & 1 deletion cpp/test/distance/dist_cos.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2023, NVIDIA CORPORATION.
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,10 +29,12 @@ class DistanceExpCosXequalY
: public DistanceTestSameBuffer<raft::distance::DistanceType::CosineExpanded, DataType> {};

const std::vector<DistanceInputs<float>> inputsf = {
{0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL},
{0.001f, 1024, 1024, 32, true, 1234ULL},
{0.001f, 1024, 32, 1024, true, 1234ULL},
{0.001f, 32, 1024, 1024, true, 1234ULL},
{0.003f, 1024, 1024, 1024, true, 1234ULL},
{0.001f, (65536 + 128) * 128, 128, 8, false, 1234ULL},
{0.001f, 1024, 1024, 32, false, 1234ULL},
{0.001f, 1024, 32, 1024, false, 1234ULL},
{0.001f, 32, 1024, 1024, false, 1234ULL},
4 changes: 3 additions & 1 deletion cpp/test/distance/dist_l2_exp.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2023, NVIDIA CORPORATION.
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,12 +29,14 @@ class DistanceEucExpTestXequalY
: public DistanceTestSameBuffer<raft::distance::DistanceType::L2Expanded, DataType> {};

const std::vector<DistanceInputs<float>> inputsf = {
{0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL},
{0.001f, 2048, 4096, 128, true, 1234ULL},
{0.001f, 1024, 1024, 32, true, 1234ULL},
{0.001f, 1024, 32, 1024, true, 1234ULL},
{0.001f, 32, 1024, 1024, true, 1234ULL},
{0.003f, 1024, 1024, 1024, true, 1234ULL},
{0.003f, 1021, 1021, 1021, true, 1234ULL},
{0.001f, (65536 + 128) * 128, 128, 8, false, 1234ULL},
{0.001f, 1024, 1024, 32, false, 1234ULL},
{0.001f, 1024, 32, 1024, false, 1234ULL},
{0.001f, 32, 1024, 1024, false, 1234ULL},
2 changes: 1 addition & 1 deletion cpp/test/distance/distance_base.cuh
@@ -339,7 +339,7 @@ void naiveDistance(DataType* dist,
DataType metric_arg = 2.0f,
cudaStream_t stream = 0)
{
static const dim3 TPB(16, 32, 1);
static const dim3 TPB(4, 256, 1);
dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);

switch (type) {
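
A plausible reading of the block-shape change above: the reference kernel launches nblks.y = ceildiv(n, TPB.y) blocks, and the large test cases added in this commit use n on the order of (65536 + 128) * 128 ≈ 8.4 million; with the old (16, 32, 1) shape that grid.y would exceed CUDA's 65535 limit, while (4, 256, 1) keeps it well below. A quick check of that arithmetic, assuming the second and third test-input fields are m and n:

```cpp
// Quick arithmetic check for the block-shape change (m and n assumed from the new test inputs).
#include <cstdio>

static long long ceildiv(long long a, long long b) { return (a + b - 1) / b; }  // stand-in for raft::ceildiv

int main()
{
  const long long m = 128;                    // assumed rows in the new large test case
  const long long n = (65536LL + 128) * 128;  // assumed columns: 8,404,992

  // Old TPB (16, 32, 1): grid.y = ceildiv(n, 32) = 262,656 -> above the 65,535 grid.y limit.
  std::printf("old grid: (%lld, %lld)\n", ceildiv(m, 16), ceildiv(n, 32));

  // New TPB (4, 256, 1): grid.y = ceildiv(n, 256) = 32,832 -> within the limit.
  std::printf("new grid: (%lld, %lld)\n", ceildiv(m, 4), ceildiv(n, 256));
  return 0;
}
```
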
4 changes: 2 additions & 2 deletions dependencies.yaml
@@ -402,7 +402,7 @@ dependencies:
common:
- output_types: [conda, pyproject]
packages:
- &numpy numpy>=1.23
- &numpy numpy>=1.23,<2.0a0
- output_types: [conda]
packages:
- *rmm_conda
@@ -443,7 +443,7 @@ dependencies:
- ucx-py==0.37.*
- output_types: conda
packages:
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
- ucx-proc=*=gpu
- &ucx_py_conda ucx-py==0.37.*
- output_types: pyproject
10 changes: 10 additions & 0 deletions docs/source/conf.py
@@ -39,6 +39,8 @@
"sphinx_copybutton"
]



breathe_default_project = "RAFT"
breathe_projects = {
"RAFT": "../../cpp/doxygen/_xml/",
@@ -65,6 +67,14 @@
copyright = "2023, NVIDIA Corporation"
author = "NVIDIA Corporation"

rst_prolog = """
.. attention::
    The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called `cuVS <https://github.com/rapidsai/cuvs>`_. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.08 (August) release.
"""

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -5,6 +5,7 @@ RAPIDS RAFT: Reusable Accelerated Functions and Tools for Vector Search and More
:width: 800
:alt: RAFT Tech Stack


Useful Resources
################

2 changes: 1 addition & 1 deletion python/pylibraft/pyproject.toml
@@ -36,7 +36,7 @@ license = { text = "Apache 2.0" }
requires-python = ">=3.9"
dependencies = [
"cuda-python>=11.7.1,<12.0a0",
"numpy>=1.23",
"numpy>=1.23,<2.0a0",
"rmm==24.4.*",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
2 changes: 1 addition & 1 deletion python/raft-dask/pyproject.toml
@@ -36,7 +36,7 @@ dependencies = [
"dask-cuda==24.4.*",
"joblib>=0.11",
"numba>=0.57",
"numpy>=1.23",
"numpy>=1.23,<2.0a0",
"pylibraft==24.4.*",
"rapids-dask-dependency==24.4.*",
"ucx-py==0.37.*",
