rapidsai · raydouglass · Feb 13, 2024 · Nov 9, 2023 · Nov 17, 2023 · Nov 17, 2023
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -49,7 +49,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -62,15 +62,15 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   wheel-build-pylibwholegraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -80,7 +80,7 @@ jobs:
   wheel-publish-pylibwholegraph:
     needs: wheel-build-pylibwholegraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}

@@ -21,41 +21,41 @@ jobs:
       - wheel-build-pylibwholegraph
       - wheel-test-pylibwholegraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       arch: "amd64"
@@ -64,14 +64,14 @@ jobs:
   wheel-build-pylibwholegraph:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/build_wheel.sh
   wheel-test-pylibwholegraph:
     needs: wheel-build-pylibwholegraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/test_wheel.sh

@@ -16,23 +16,23 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   conda-pytorch-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   wheel-tests-pylibwholegraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

diff --git a/.gitignore b/.gitignore
@@ -87,3 +87,4 @@ cpp/.idea/
 cpp/cmake-build-debug/
 pylibwholegraph/.idea/
 pylibwholegraph/cmake-build-debug/
+compile_commands.json
@@ -40,7 +40,7 @@ repos:
         pass_filenames: false
         additional_dependencies: [gitpython]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.5.1
+    rev: v1.8.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,28 @@
+# wholegraph 24.02.00 (12 Feb 2024)
+
+## 🐛 Bug Fixes
+
+- Revert "Exclude tests from builds (#127)" ([#130](https://github.com/rapidsai/wholegraph/pull/130)) [@raydouglass](https://github.com/raydouglass)
+- Exclude tests from builds ([#127](https://github.com/rapidsai/wholegraph/pull/127)) [@vyasr](https://github.com/vyasr)
+- fix a bug for embedding optimizer, which leads to undefined behavior ([#108](https://github.com/rapidsai/wholegraph/pull/108)) [@linhu-nv](https://github.com/linhu-nv)
+- fix inferencesample option ([#107](https://github.com/rapidsai/wholegraph/pull/107)) [@chuangz0](https://github.com/chuangz0)
+
+## 🚀 New Features
+
+- allow users to control gather/scatter sms ([#124](https://github.com/rapidsai/wholegraph/pull/124)) [@linhu-nv](https://github.com/linhu-nv)
+
+## 🛠️ Improvements
+
+- Logging level ([#123](https://github.com/rapidsai/wholegraph/pull/123)) [@linhu-nv](https://github.com/linhu-nv)
+- Fix pip dependencies ([#118](https://github.com/rapidsai/wholegraph/pull/118)) [@trxcllnt](https://github.com/trxcllnt)
+- Remove usages of rapids-env-update ([#117](https://github.com/rapidsai/wholegraph/pull/117)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- refactor CUDA versions in dependencies.yaml ([#115](https://github.com/rapidsai/wholegraph/pull/115)) [@jameslamb](https://github.com/jameslamb)
+- Don't overwrite wholegraph_ROOT if provided ([#114](https://github.com/rapidsai/wholegraph/pull/114)) [@vyasr](https://github.com/vyasr)
+- added Direct IO support for WholeMemory loading ([#113](https://github.com/rapidsai/wholegraph/pull/113)) [@dongxuy04](https://github.com/dongxuy04)
+- Align versions for cudnn, clang-tools, cython, and doxygen with the rest of RAPIDS. ([#112](https://github.com/rapidsai/wholegraph/pull/112)) [@bdice](https://github.com/bdice)
+- Reset WholeGraph communicators during the finalize call ([#111](https://github.com/rapidsai/wholegraph/pull/111)) [@chang-l](https://github.com/chang-l)
+- Forward-merge branch-23.12 to branch-24.02 ([#102](https://github.com/rapidsai/wholegraph/pull/102)) [@bdice](https://github.com/bdice)
+
 # wholegraph 23.12.00 (6 Dec 2023)
 
 ## 🐛 Bug Fixes

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-23.12.00
+24.02.00
@@ -3,7 +3,11 @@
 
 set -euo pipefail
 
-source rapids-env-update
+rapids-configure-conda-channels
+
+source rapids-configure-sccache
+
+source rapids-date-string
 
 export CMAKE_GENERATOR=Ninja
 

@@ -22,7 +22,7 @@ rapids-print-env
 rapids-logger "Downloading artifacts from previous jobs"
 
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
-export RAPIDS_VERSION_NUMBER="23.12"
+export RAPIDS_VERSION_NUMBER="24.02"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-mamba-retry install \

@@ -3,7 +3,11 @@
 
 set -euo pipefail
 
-source rapids-env-update
+rapids-configure-conda-channels
+
+source rapids-configure-sccache
+
+source rapids-date-string
 
 export CMAKE_GENERATOR=Ninja
 

@@ -9,22 +9,23 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.0
-- clangxx=16.0.0
+- clang-tools==16.0.6
+- clangxx==16.0.6
 - cmake>=3.26.4
 - cuda-nvtx=11.8
-- cudatoolkit=11.8
-- cudnn=8.4
+- cuda-version=11.8
+- cudatoolkit
+- cudnn=8.8
 - cxx-compiler
-- cython
-- doxygen=1.8.20
+- cython>=3.0.0
+- doxygen==1.9.1
 - gcc_linux-64=11.*
 - gitpython
 - graphviz
 - ipykernel
 - ipython
-- libraft-headers==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- librmm==24.2.*
 - nanobind>=0.2.0
 - nbsphinx
 - nccl

@@ -9,24 +9,24 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.0
-- clangxx=16.0.0
+- clang-tools==16.0.6
+- clangxx==16.0.6
 - cmake>=3.26.4
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-nvtx
 - cuda-version=12.0
-- cudnn=8.4
+- cudnn=8.8
 - cxx-compiler
-- cython
-- doxygen=1.8.20
+- cython>=3.0.0
+- doxygen==1.9.1
 - gcc_linux-64=11.*
 - gitpython
 - graphviz
 - ipykernel
 - ipython
-- libraft-headers==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- librmm==24.2.*
 - nanobind>=0.2.0
 - nbsphinx
 - nccl

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 #=============================================================================
 
-set(RAPIDS_VERSION "23.12")
+set(RAPIDS_VERSION "24.02")
 set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00")
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)

diff --git a/cpp/Doxyfile b/cpp/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "WholeGraph C API"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 23.12
+PROJECT_NUMBER         = 24.02
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
@@ -57,7 +57,7 @@ endfunction()
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION    ${WHOLEGRAPH_MIN_VERSION_raft}
                         FORK       rapidsai
-			PINNED_TAG branch-${WHOLEGRAPH_BRANCH_VERSION_raft}
+                        PINNED_TAG branch-${WHOLEGRAPH_BRANCH_VERSION_raft}
 
                         # When PINNED_TAG above doesn't match wholegraph,
                         # force local raft clone in build directory

diff --git a/cpp/include/wholememory/embedding.h b/cpp/include/wholememory/embedding.h
@@ -130,6 +130,7 @@ wholememory_error_code_t wholememory_destroy_embedding_cache_policy(
  * @param memory_location : Memory Location of the underlying WholeMemory
  * @param optimizer : Optimizer to use for training, if don't train embedding, use nullptr
  * @param cache_policy : Cache policy for this embedding, if don't use cache, use nullptr
+ * @param user_defined_sms : User-defined number of SMs (streaming multiprocessors) used for raw embedding gather/scatter
  * @return : wholememory_error_code_t
  */
 wholememory_error_code_t wholememory_create_embedding(
@@ -139,7 +140,8 @@ wholememory_error_code_t wholememory_create_embedding(
   wholememory_memory_type_t memory_type,
   wholememory_memory_location_t memory_location,
   wholememory_embedding_optimizer_t optimizer,
-  wholememory_embedding_cache_policy_t cache_policy);
+  wholememory_embedding_cache_policy_t cache_policy,
+  int user_defined_sms = -1);
 
 /**
  * Destroy WholeMemory Embedding

diff --git a/cpp/include/wholememory/wholememory.h b/cpp/include/wholememory/wholememory.h
@@ -83,9 +83,10 @@ enum wholememory_distributed_backend_t {
 /**
  * Initialize WholeMemory library
  * @param flags : reserved should be 0
+ * @param wm_log_level : wholememory log level, the default level is "info"
  * @return : wholememory_error_code_t
  */
-wholememory_error_code_t wholememory_init(unsigned int flags);
+wholememory_error_code_t wholememory_init(unsigned int flags, unsigned int wm_log_level = 3);
 
 /**
  * Finalize WholeMemory library

diff --git a/cpp/include/wholememory/wholememory_op.h b/cpp/include/wholememory/wholememory_op.h
@@ -30,13 +30,15 @@ extern "C" {
  * @param output_tensor : output tensor to gather to, should NOT be WholeMemoryTensor
  * @param p_env_fns : pointers to environment functions.
  * @param stream : cudaStream_t to use.
+ * @param gather_sms : the number of streaming multiprocessors (SMs) used by the gather kernel
  * @return : wholememory_error_code_t
  */
 wholememory_error_code_t wholememory_gather(wholememory_tensor_t wholememory_tensor,
                                             wholememory_tensor_t indices_tensor,
                                             wholememory_tensor_t output_tensor,
                                             wholememory_env_func_t* p_env_fns,
-                                            void* stream);
+                                            void* stream,
+                                            int gather_sms = -1);
 
 /**
  * Scatter Op
@@ -45,13 +47,15 @@ wholememory_error_code_t wholememory_gather(wholememory_tensor_t wholememory_ten
  * @param wholememory_tensor : WholeMemory Tensor of embedding table.
  * @param p_env_fns : pointers to environment functions.
  * @param stream : cudaStream_t to use.
+ * @param scatter_sms : the number of streaming multiprocessors (SMs) used by the scatter kernel
  * @return : wholememory_error_code_t
  */
 wholememory_error_code_t wholememory_scatter(wholememory_tensor_t input_tensor,
                                              wholememory_tensor_t indices_tensor,
                                              wholememory_tensor_t wholememory_tensor,
                                              wholememory_env_func_t* p_env_fns,
-                                             void* stream);
+                                             void* stream,
+                                             int scatter_sms = -1);
 
 /**
  * Just a test function,