diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 396ff859..0406e744 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -57,3 +57,41 @@ jobs:
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
+  wheel-build-ucxx:  # runs ci/build_wheel_ucxx.sh through the shared wheels-build workflow
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/build_wheel_ucxx.sh
+  wheel-publish-ucxx:  # waits for the ucxx wheel build, then calls the shared wheels-publish workflow
+    needs: wheel-build-ucxx
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: ucxx
+  wheel-build-distributed-ucxx:  # runs ci/build_wheel_distributed_ucxx.sh through the shared wheels-build workflow
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/build_wheel_distributed_ucxx.sh
+  wheel-publish-distributed-ucxx:  # gated on both wheel build jobs
+    needs: [wheel-build-ucxx, wheel-build-distributed-ucxx]
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: distributed_ucxx
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ca2c646f..fa7f0007 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -17,6 +17,10 @@ jobs:
       - docs-build
       - conda-cpp-tests
       - conda-python-tests
+      - wheel-build-ucxx
+      - wheel-tests-ucxx
+      - wheel-build-distributed-ucxx
+      - wheel-tests-distributed-ucxx
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04
   checks:
@@ -54,3 +58,33 @@ jobs:
     with:
       build_type: pull-request
       container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
+  wheel-build-ucxx:  # builds the ucxx wheel once the `checks` job has passed
+    needs: checks
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_ucxx.sh
+  wheel-tests-ucxx:  # runs ci/test_wheel_ucxx.sh after the ucxx wheel build
+    needs: wheel-build-ucxx
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_ucxx.sh
+      container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
+  wheel-build-distributed-ucxx:  # builds the distributed-ucxx wheel once the `checks` job has passed
+    needs: checks
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_distributed_ucxx.sh
+  wheel-tests-distributed-ucxx:  # needs both builds: the test script also downloads the ucxx wheel
+    needs: [wheel-build-ucxx, wheel-build-distributed-ucxx]
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_distributed_ucxx.sh
+      container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 24e975fb..73b47503 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -32,3 +32,23 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
+  wheel-tests-ucxx:  # runs ci/test_wheel_ucxx.sh through the shared wheels-test workflow
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_ucxx.sh
+      container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
+  wheel-tests-distributed-ucxx:  # runs ci/test_wheel_distributed_ucxx.sh through the shared wheels-test workflow
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    secrets: inherit
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_distributed_ucxx.sh
+      container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000"
diff --git a/VERSION b/VERSION
new file mode 100644
index 00000000..d142a90c
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.37.00
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
new file mode 100755
index 00000000..46cb8383
--- /dev/null
+++ b/ci/build_wheel.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="${1:?usage: $0 <package_name> <package_dir>}"  # "ucxx" or "distributed-ucxx"
+package_dir="${2:?usage: $0 <package_name> <package_dir>}"   # directory containing pyproject.toml
+
+source rapids-configure-sccache
+source rapids-date-string
+
+version=$(rapids-generate-version)
+commit=$(git rev-parse HEAD)
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
+
+# This is the version of the suffix with a preceding hyphen. It's used
+# everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+# Patch project metadata files to include the CUDA version suffix and version override.
+pyproject_file="${package_dir}/pyproject.toml"
+
+sed -i -E "s/^name = \"${package_name}(.*)?\"$/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" "${pyproject_file}"
+echo "${version}" > VERSION  # overwrites the VERSION file in the current working directory
+sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py"
+
+# For nightlies we want to ensure that we're pulling in alphas as well. The
+# easiest way to do so is to augment the spec with a constraint containing a
+# min alpha version that doesn't affect the version bounds but does allow usage
+# of alpha versions for that dependency without --pre.
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+if [[ ${package_name} == "distributed-ucxx" ]]; then
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" "${pyproject_file}"
+    sed -r -i "s/\"ucxx(.*)\"/\"ucxx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" "${pyproject_file}"
+
+    python -m pip wheel "${package_dir}/" -w "${package_dir}/dist" -vvv --no-deps --disable-pip-version-check
+
+    RAPIDS_PY_WHEEL_NAME="distributed_ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist
+elif [[ ${package_name} == "ucxx" ]]; then
+    # Add the -cuXX suffix (and, for non-release builds, the alpha spec) to dependency names
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" "${pyproject_file}"
+    sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" "${pyproject_file}"
+    sed -r -i "s/cudf(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" "${pyproject_file}"
+
+    # Update cupy package name (different suffix from RAPIDS)
+    if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
+        sed -i "s/cupy-cuda11x/cupy-cuda12x/g" "${pyproject_file}"
+    fi
+
+    SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_RMM=ON" \
+        python -m pip wheel "${package_dir}"/ -w "${package_dir}"/dist -vvv --no-deps --disable-pip-version-check
+
+    python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+
+    # Auditwheel rewrites dynamic libraries that are referenced at link time in the
+    # package. However, UCX loads a number of sub-libraries at runtime via dlopen;
+    # these are not picked up by auditwheel. Since we have a priori knowledge of
+    # what these libraries are, we mimic the behaviour of auditwheel by using the
+    # same hash-based uniqueness scheme and rewriting the link paths.
+
+    WHL=$(realpath ${package_dir}/final_dist/ucxx*manylinux*.whl)
+
+    # first grab the auditwheel-renamed file names of libuc{tms} and libnuma
+    LIBUCM=$(unzip -l "$WHL" | awk 'match($4, /libucm-[^\.]+\./) { print substr($4, RSTART) }')
+    LIBUCT=$(unzip -l "$WHL" | awk 'match($4, /libuct-[^\.]+\./) { print substr($4, RSTART) }')
+    LIBUCS=$(unzip -l "$WHL" | awk 'match($4, /libucs-[^\.]+\./) { print substr($4, RSTART) }')
+    LIBNUMA=$(unzip -l "$WHL" | awk 'match($4, /libnuma-[^\.]+\./) { print substr($4, RSTART) }')
+
+    # Extract the libraries that have already been patched in by auditwheel
+    mkdir -p repair_dist/ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs/ucx
+    unzip "$WHL" "ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs/*.so*" -d repair_dist/
+
+    # Patch the RPATH to include ORIGIN for each library
+    pushd repair_dist/ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs
+    for f in libu*.so*
+    do
+        if [[ -f $f ]]; then
+            patchelf --add-rpath '$ORIGIN' $f
+        fi
+    done
+
+    popd
+
+    # Now copy in all the extra libraries that are only ever loaded at runtime
+    pushd repair_dist/ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs/ucx
+    if [[ -d /usr/lib64/ucx ]]; then
+        cp -P /usr/lib64/ucx/* .
+    elif [[ -d /usr/lib/ucx ]]; then
+        cp -P /usr/lib/ucx/* .
+    else
+        echo "Could not find ucx libraries"
+        exit 1
+    fi
+
+    # we link against <python>/lib/site-packages/ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs/libuc{ptsm}
+    # we also amend the rpath to search one directory above to *find* libuc{tsm}
+    for f in libu*.so*
+    do
+        # Avoid patching symlinks, which is redundant
+        if [[ ! -L $f ]]; then
+            patchelf --replace-needed libuct.so.0 $LIBUCT $f
+            patchelf --replace-needed libucs.so.0 $LIBUCS $f
+            patchelf --replace-needed libucm.so.0 $LIBUCM $f
+            patchelf --replace-needed libnuma.so.1 $LIBNUMA $f
+            patchelf --add-rpath '$ORIGIN/..' $f
+        fi
+    done
+
+    # Bring in cudart as well. To avoid symbol collision with other libraries e.g.
+    # cupy we mimic auditwheel by renaming the libraries to include a short hash
+    # of the library contents. Since there will typically be a chain of symlinks
+    # libcudart.so->libcudart.so.X->libcudart.so.X.Y.Z we need to follow the chain
+    # and rename all of them.
+
+    find /usr/local/cuda/ -name "libcudart*.so*" | xargs cp -P -t .
+    src=libcudart.so
+    hash=$(sha256sum ${src} | awk '{print substr($1, 1, 8)}')  # awk strings are 1-indexed: first 8 hex digits
+    target=$(basename $(readlink -f ${src}))
+
+    mv ${target} ${target/libcudart/libcudart-${hash}}
+    while readlink ${src} > /dev/null; do
+        target=$(readlink ${src})
+        ln -s ${target/libcudart/libcudart-${hash}} ${src/libcudart/libcudart-${hash}}
+        rm -f ${src}
+        src=${target}
+    done
+
+    to_rewrite=$(ldd libuct_cuda.so | awk '/libcudart/ { print $1 }')
+    patchelf --replace-needed ${to_rewrite} libcudart-${hash}.so libuct_cuda.so
+    patchelf --add-rpath '$ORIGIN' libuct_cuda.so
+
+    popd
+
+    pushd repair_dist
+    zip -r "$WHL" ucxx_${RAPIDS_PY_CUDA_SUFFIX}.libs/
+    popd
+
+    RAPIDS_PY_WHEEL_NAME="ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+else
+    echo "Unknown package '${package_name}'"
+    exit 1
+fi
diff --git a/ci/build_wheel_distributed_ucxx.sh b/ci/build_wheel_distributed_ucxx.sh
new file mode 100755
index 00000000..77c2d988
--- /dev/null
+++ b/ci/build_wheel_distributed_ucxx.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Source directory handed to ci/build_wheel.sh as its <package_dir> argument.
+wheel_src_dir="python/distributed-ucxx"
+./ci/build_wheel.sh distributed-ucxx "${wheel_src_dir}"
diff --git a/ci/build_wheel_ucxx.sh b/ci/build_wheel_ucxx.sh
new file mode 100755
index 00000000..12cdcbde
--- /dev/null
+++ b/ci/build_wheel_ucxx.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Source directory handed to ci/build_wheel.sh as its <package_dir> argument.
+wheel_src_dir="python"
+./ci/build_wheel.sh ucxx "${wheel_src_dir}"
diff --git a/ci/test_common.sh b/ci/test_common.sh
new file mode 100755
index 00000000..2fbc2c17
--- /dev/null
+++ b/ci/test_common.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: BSD-3-Clause
+
+set -euo pipefail
+
+
+################################### Common #####################################
+log_command() {  # Print "Running:" followed by the command line given as $1
+  CMD_LINE=$1  # not declared `local`, so this also overwrites any CMD_LINE set by the caller
+  echo -e "\e[1mRunning: \n ${CMD_LINE}\e[0m"  # \e[1m ... \e[0m: ANSI bold on / reset
+}
+
+print_system_stats() {  # Log nvidia-smi output and the contents of /etc/hosts
+  rapids-logger "Check GPU usage"
+  nvidia-smi
+
+  rapids-logger "Check NICs"
+  awk 'END{print $1}' /etc/hosts  # first field of the last line of /etc/hosts
+  cat /etc/hosts
+}
+
+print_ucx_config() {  # Log the output of `ucx_info -v`
+  rapids-logger "UCX Version and Build Configuration"
+
+  ucx_info -v
+}
+
+
+##################################### C++ ######################################
+_SERVER_PORT=12345
+
+run_cpp_tests() {  # Run the UCXX_TEST gtest binary under a 10 minute timeout
+  RUNTIME_PATH=${CONDA_PREFIX:-./}  # falls back to the current directory when CONDA_PREFIX is unset or empty
+  BINARY_PATH=${RUNTIME_PATH}/bin
+
+  CMD_LINE="timeout 10m ${BINARY_PATH}/gtests/libucxx/UCXX_TEST"
+
+  log_command "${CMD_LINE}"
+  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE}  # unquoted on purpose: CMD_LINE must word-split into command + arguments
+}
+
+run_cpp_benchmark() {  # Run ucxx_perftest as a background server plus a client against 127.0.0.1
+  SERVER_PORT=$1    # port handed to both processes via -p
+  PROGRESS_MODE=$2  # handed to both processes via -m
+
+  RUNTIME_PATH=${CONDA_PREFIX:-./}  # falls back to the current directory when CONDA_PREFIX is unset or empty
+  BINARY_PATH=${RUNTIME_PATH}/bin
+
+  CMD_LINE_SERVER="timeout 1m ${BINARY_PATH}/benchmarks/libucxx/ucxx_perftest -s 8388608 -r -n 20 -m ${PROGRESS_MODE} -p ${SERVER_PORT}"
+  CMD_LINE_CLIENT="timeout 1m ${BINARY_PATH}/benchmarks/libucxx/ucxx_perftest -s 8388608 -r -n 20 -m ${PROGRESS_MODE} -p ${SERVER_PORT} 127.0.0.1"
+
+  log_command "${CMD_LINE_SERVER}"
+  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE_SERVER} &  # backgrounded; this function never waits on or kills it
+  sleep 1  # presumably gives the server time to start listening before the client connects
+
+  log_command "${CMD_LINE_CLIENT}"
+  ${CMD_LINE_CLIENT}  # last command, so the client's exit status is the function's status
+}
+
+run_cpp_example() {  # Run the ucxx_example_basic binary under a 1 minute timeout
+  SERVER_PORT=$1    # handed to the example via -p
+  PROGRESS_MODE=$2  # handed to the example via -m
+
+  RUNTIME_PATH=${CONDA_PREFIX:-./}  # falls back to the current directory when CONDA_PREFIX is unset or empty
+  BINARY_PATH=${RUNTIME_PATH}/bin
+
+  CMD_LINE="timeout 1m ${BINARY_PATH}/examples/libucxx/ucxx_example_basic -m ${PROGRESS_MODE} -p ${SERVER_PORT}"
+
+  log_command "${CMD_LINE}"
+  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE}  # unquoted on purpose: CMD_LINE must word-split into command + arguments
+}
+
+run_cpp_port_retry() {  # Retry a C++ benchmark/example, each time on a new port, until one attempt succeeds
+  MAX_ATTEMPTS=${1}   # maximum number of attempts before giving up
+  RUN_TYPE=${2}       # "benchmark" or "example"
+  PROGRESS_MODE=${3}  # forwarded unchanged to run_cpp_benchmark / run_cpp_example
+  LAST_STATUS=1       # report failure (not an unbound or stale value) if no attempt runs at all
+  set +e              # a failing attempt must not abort the script; we retry instead
+  for attempt in $(seq 1 ${MAX_ATTEMPTS}); do
+    echo "Attempt ${attempt}/${MAX_ATTEMPTS} to run ${RUN_TYPE}"
+
+    _SERVER_PORT=$((_SERVER_PORT + 1))    # Use different ports every time to prevent `Device is busy`
+
+    if [[ "${RUN_TYPE}" == "benchmark" ]]; then
+      run_cpp_benchmark ${_SERVER_PORT} ${PROGRESS_MODE}
+    elif [[ "${RUN_TYPE}" == "example" ]]; then
+      run_cpp_example ${_SERVER_PORT} ${PROGRESS_MODE}
+    else
+      set -e
+      echo "Unknown test type ${RUN_TYPE}"
+      exit 1
+    fi
+
+    LAST_STATUS=$?  # status of whichever run_cpp_* function was just called
+    if [ ${LAST_STATUS} -eq 0 ]; then
+      break;
+    fi
+    sleep 1
+  done
+  set -e
+
+  if [ ${LAST_STATUS} -ne 0 ]; then
+    echo "Failure running ${RUN_TYPE} after ${MAX_ATTEMPTS} attempts"
+    exit $LAST_STATUS
+  fi
+}
+
+
+#################################### Python ####################################
+run_py_tests() {  # Run the pytest suite in python/ucxx/_lib/tests/ under a 2 minute timeout
+  CMD_LINE="timeout 2m python -m pytest -vs python/ucxx/_lib/tests/"
+  log_command "${CMD_LINE}"
+  ${CMD_LINE}  # unquoted on purpose: word-splits into exactly the command logged above
+}
+
+run_py_tests_async() {  # Run python/ucxx/_lib_async/tests/ with the given UCXPY_* settings, or skip it
+  PROGRESS_MODE=$1              # exported to pytest as UCXPY_PROGRESS_MODE
+  ENABLE_DELAYED_SUBMISSION=$2  # exported to pytest as UCXPY_ENABLE_DELAYED_SUBMISSION
+  ENABLE_PYTHON_FUTURE=$3       # exported to pytest as UCXPY_ENABLE_PYTHON_FUTURE
+  SKIP=$4                       # non-zero: only print the command instead of running it
+
+  CMD_LINE="UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 20m python -m pytest -vs python/ucxx/_lib_async/tests/ --durations=50"
+
+  if [ $SKIP -ne 0 ]; then
+    echo -e "\e[1;33mSkipping unstable test: ${CMD_LINE}\e[0m"
+  else
+    log_command "${CMD_LINE}"  # CMD_LINE is only logged; the command below is a hand-written copy to keep in sync
+    UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 20m python -m pytest -vs python/ucxx/_lib_async/tests/ --durations=50
+  fi
+}
+
+run_py_benchmark() {  # Run `python -m ucxx.benchmarks.send_recv` under a 2 minute timeout
+  BACKEND=$1                    # value for --backend
+  PROGRESS_MODE=$2              # value for --progress-mode
+  ASYNCIO_WAIT=$3               # non-zero adds the --asyncio-wait flag
+  ENABLE_DELAYED_SUBMISSION=$4  # exported as UCXPY_ENABLE_DELAYED_SUBMISSION
+  ENABLE_PYTHON_FUTURE=$5       # exported as UCXPY_ENABLE_PYTHON_FUTURE
+  N_BUFFERS=$6                  # value for --n-buffers
+  SLOW=$7                       # non-zero prints a warning that the run may look like a deadlock
+
+  if [ $ASYNCIO_WAIT -ne 0 ]; then  # turn the numeric argument into the flag text (or nothing)
+    ASYNCIO_WAIT="--asyncio-wait"
+  else
+    ASYNCIO_WAIT=""
+  fi
+
+  CMD_LINE="UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 2m python -m ucxx.benchmarks.send_recv --backend ${BACKEND} -o cupy --reuse-alloc -n 8MiB --n-buffers $N_BUFFERS --progress-mode ${PROGRESS_MODE} ${ASYNCIO_WAIT}"
+
+  # Workaround for https://github.com/rapidsai/ucxx/issues/15
+  CMD_LINE="UCX_KEEPALIVE_INTERVAL=1ms ${CMD_LINE}"
+
+  log_command "${CMD_LINE}"  # CMD_LINE is only logged; the command at the end is a hand-written copy to keep in sync
+  if [ $SLOW -ne 0 ]; then
+    echo -e "\e[1;33mSLOW BENCHMARK: it may seem like a deadlock but will eventually complete.\e[0m"
+  fi
+
+  UCX_KEEPALIVE_INTERVAL=1ms UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 2m python -m ucxx.benchmarks.send_recv --backend ${BACKEND} -o cupy --reuse-alloc -n 8MiB --n-buffers $N_BUFFERS --progress-mode ${PROGRESS_MODE} ${ASYNCIO_WAIT}
+}
+
+################################## Distributed #################################
+run_distributed_ucxx_tests() {  # Run the distributed_ucxx pytest suite under a 10 minute timeout
+  PROGRESS_MODE=$1              # exported to pytest as UCXPY_PROGRESS_MODE
+  ENABLE_DELAYED_SUBMISSION=$2  # exported to pytest as UCXPY_ENABLE_DELAYED_SUBMISSION
+  ENABLE_PYTHON_FUTURE=$3       # exported to pytest as UCXPY_ENABLE_PYTHON_FUTURE
+
+  CMD_LINE="UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 10m python -m pytest -vs python/distributed-ucxx/distributed_ucxx/tests/"
+
+  # Workaround for https://github.com/rapidsai/ucxx/issues/15 (currently disabled here)
+  # CMD_LINE="UCX_KEEPALIVE_INTERVAL=1ms ${CMD_LINE}"
+
+  log_command "${CMD_LINE}"  # CMD_LINE is only logged; the command below is a hand-written copy to keep in sync
+  UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 10m python -m pytest -vs python/distributed-ucxx/distributed_ucxx/tests/
+}
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 5d72d463..08f202e9 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -5,7 +5,7 @@
 
 set -euo pipefail
 
-source "$(dirname "$0")/test_utils.sh"
+source "$(dirname "$0")/test_common.sh"
 
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -24,75 +24,6 @@ print_system_stats
 
 BINARY_PATH=${CONDA_PREFIX}/bin
 
-_SERVER_PORT=12345
-
-run_tests() {
-  CMD_LINE="timeout 10m ${BINARY_PATH}/gtests/libucxx/UCXX_TEST"
-
-  log_command "${CMD_LINE}"
-  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE}
-}
-
-run_benchmark() {
-  SERVER_PORT=$1
-  PROGRESS_MODE=$2
-
-  CMD_LINE_SERVER="timeout 1m ${BINARY_PATH}/benchmarks/libucxx/ucxx_perftest -s 8388608 -r -n 20 -m ${PROGRESS_MODE} -p ${SERVER_PORT}"
-  CMD_LINE_CLIENT="timeout 1m ${BINARY_PATH}/benchmarks/libucxx/ucxx_perftest -s 8388608 -r -n 20 -m ${PROGRESS_MODE} -p ${SERVER_PORT} 127.0.0.1"
-
-  log_command "${CMD_LINE_SERVER}"
-  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE_SERVER} &
-  sleep 1
-
-  log_command "${CMD_LINE_CLIENT}"
-  ${CMD_LINE_CLIENT}
-}
-
-run_example() {
-  SERVER_PORT=$1
-  PROGRESS_MODE=$2
-
-  CMD_LINE="timeout 1m ${BINARY_PATH}/examples/libucxx/ucxx_example_basic -m ${PROGRESS_MODE} -p ${SERVER_PORT}"
-
-  log_command "${CMD_LINE}"
-  UCX_TCP_CM_REUSEADDR=y ${CMD_LINE}
-}
-
-run_port_retry() {
-  MAX_ATTEMPTS=${1}
-  RUN_TYPE=${2}
-  PROGRESS_MODE=${3}
-
-  set +e
-  for attempt in $(seq 1 ${MAX_ATTEMPTS}); do
-    echo "Attempt ${attempt}/${MAX_ATTEMPTS} to run ${RUN_TYPE}"
-
-    _SERVER_PORT=$((_SERVER_PORT + 1))    # Use different ports every time to prevent `Device is busy`
-
-    if [[ "${RUN_TYPE}" == "benchmark" ]]; then
-      run_benchmark ${_SERVER_PORT} ${PROGRESS_MODE}
-    elif [[ "${RUN_TYPE}" == "example" ]]; then
-      run_example ${_SERVER_PORT} ${PROGRESS_MODE}
-    else
-      set -e
-      echo "Unknown test type "${RUN_TYPE}""
-      exit 1
-    fi
-
-    LAST_STATUS=$?
-    if [ ${LAST_STATUS} -eq 0 ]; then
-      break;
-    fi
-    sleep 1
-  done
-  set -e
-
-  if [ ${LAST_STATUS} -ne 0 ]; then
-    echo "Failure running benchmark client after ${MAX_ATTEMPTS} attempts"
-    exit $LAST_STATUS
-  fi
-}
-
 rapids-logger "Downloading artifacts from previous jobs"
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
@@ -104,20 +35,20 @@ print_ucx_config
 
 rapids-logger "Run tests with conda package"
 rapids-logger "C++ Tests"
-run_tests
+run_cpp_tests
 
 rapids-logger "C++ Benchmarks"
-# run_port_retry MAX_ATTEMPTS RUN_TYPE PROGRESS_MODE
-run_port_retry 10 "benchmark" "polling"
-run_port_retry 10 "benchmark" "blocking"
-run_port_retry 10 "benchmark" "thread-polling"
-run_port_retry 10 "benchmark" "thread-blocking"
-run_port_retry 10 "benchmark" "wait"
+# run_cpp_port_retry MAX_ATTEMPTS RUN_TYPE PROGRESS_MODE
+run_cpp_port_retry 10 "benchmark" "polling"
+run_cpp_port_retry 10 "benchmark" "blocking"
+run_cpp_port_retry 10 "benchmark" "thread-polling"
+run_cpp_port_retry 10 "benchmark" "thread-blocking"
+run_cpp_port_retry 10 "benchmark" "wait"
 
 rapids-logger "C++ Examples"
-# run_port_retry MAX_ATTEMPTS RUN_TYPE PROGRESS_MODE
-run_port_retry 10 "example" "polling"
-run_port_retry 10 "example" "blocking"
-run_port_retry 10 "example" "thread-polling"
-run_port_retry 10 "example" "thread-blocking"
-run_port_retry 10 "example" "wait"
+# run_cpp_port_retry MAX_ATTEMPTS RUN_TYPE PROGRESS_MODE
+run_cpp_port_retry 10 "example" "polling"
+run_cpp_port_retry 10 "example" "blocking"
+run_cpp_port_retry 10 "example" "thread-polling"
+run_cpp_port_retry 10 "example" "thread-blocking"
+run_cpp_port_retry 10 "example" "wait"
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 79b932db..c45c2cbb 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,7 +5,7 @@
 
 set -euo pipefail
 
-source "$(dirname "$0")/test_utils.sh"
+source "$(dirname "$0")/test_common.sh"
 
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -22,70 +22,6 @@ rapids-print-env
 
 print_system_stats
 
-run_tests() {
-  CMD_LINE="timeout 2m pytest -vs python/ucxx/_lib/tests/"
-  log_command "${CMD_LINE}"
-  timeout 2m pytest -vs python/ucxx/_lib/tests/
-}
-
-run_tests_async() {
-  PROGRESS_MODE=$1
-  ENABLE_DELAYED_SUBMISSION=$2
-  ENABLE_PYTHON_FUTURE=$3
-  SKIP=$4
-
-  CMD_LINE="UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 20m pytest -vs python/ucxx/_lib_async/tests/ --durations=50"
-
-  if [ $SKIP -ne 0 ]; then
-    echo -e "\e[1;33mSkipping unstable test: ${CMD_LINE}\e[0m"
-  else
-    log_command "${CMD_LINE}"
-    UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 20m pytest -vs python/ucxx/_lib_async/tests/ --durations=50
-  fi
-}
-
-run_py_benchmark() {
-  BACKEND=$1
-  PROGRESS_MODE=$2
-  ASYNCIO_WAIT=$3
-  ENABLE_DELAYED_SUBMISSION=$4
-  ENABLE_PYTHON_FUTURE=$5
-  N_BUFFERS=$6
-  SLOW=$7
-
-  if [ $ASYNCIO_WAIT -ne 0 ]; then
-    ASYNCIO_WAIT="--asyncio-wait"
-  else
-    ASYNCIO_WAIT=""
-  fi
-
-  CMD_LINE="UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 2m python -m ucxx.benchmarks.send_recv --backend ${BACKEND} -o cupy --reuse-alloc -n 8MiB --n-buffers $N_BUFFERS --progress-mode ${PROGRESS_MODE} ${ASYNCIO_WAIT}"
-
-  # Workaround for https://github.com/rapidsai/ucxx/issues/15
-  CMD_LINE="UCX_KEEPALIVE_INTERVAL=1ms ${CMD_LINE}"
-
-  log_command "${CMD_LINE}"
-  if [ $SLOW -ne 0 ]; then
-    echo -e "\e[1;33mSLOW BENCHMARK: it may seem like a deadlock but will eventually complete.\e[0m"
-  fi
-
-  UCX_KEEPALIVE_INTERVAL=1ms UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 2m python -m ucxx.benchmarks.send_recv --backend ${BACKEND} -o cupy --reuse-alloc -n 8MiB --n-buffers $N_BUFFERS --progress-mode ${PROGRESS_MODE} ${ASYNCIO_WAIT}
-}
-
-run_distributed_ucxx_tests() {
-  PROGRESS_MODE=$1
-  ENABLE_DELAYED_SUBMISSION=$2
-  ENABLE_PYTHON_FUTURE=$3
-
-  CMD_LINE="UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 10m pytest -vs python/distributed-ucxx/distributed_ucxx/tests/"
-
-  # Workaround for https://github.com/rapidsai/ucxx/issues/15
-  # CMD_LINE="UCX_KEEPALIVE_INTERVAL=1ms ${CMD_LINE}"
-
-  log_command "${CMD_LINE}"
-  UCXPY_PROGRESS_MODE=${PROGRESS_MODE} UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} timeout 10m pytest -vs python/distributed-ucxx/distributed_ucxx/tests/
-}
-
 rapids-logger "Downloading artifacts from previous jobs"
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
@@ -103,12 +39,12 @@ print_ucx_config
 
 rapids-logger "Run tests with conda package"
 rapids-logger "Python Core Tests"
-run_tests
+run_py_tests
 
 rapids-logger "Python Async Tests"
-# run_tests_async PROGRESS_MODE   ENABLE_DELAYED_SUBMISSION ENABLE_PYTHON_FUTURE SKIP
-run_tests_async   thread          0                         0                    0
-run_tests_async   thread          1                         1                    0
+# run_py_tests_async PROGRESS_MODE   ENABLE_DELAYED_SUBMISSION ENABLE_PYTHON_FUTURE SKIP
+run_py_tests_async   thread          0                         0                    0
+run_py_tests_async   thread          1                         1                    0
 
 rapids-logger "Python Benchmarks"
 # run_py_benchmark  BACKEND   PROGRESS_MODE   ASYNCIO_WAIT  ENABLE_DELAYED_SUBMISSION ENABLE_PYTHON_FUTURE NBUFFERS SLOW
diff --git a/ci/test_utils.sh b/ci/test_utils.sh
deleted file mode 100755
index 237b1f1d..00000000
--- a/ci/test_utils.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: BSD-3-Clause
-
-
-log_command() {
-  CMD_LINE=$1
-  echo -e "\e[1mRunning: \n ${CMD_LINE}\e[0m"
-}
-
-print_system_stats() {
-  rapids-logger "Check GPU usage"
-  nvidia-smi
-
-  rapids-logger "Check NICs"
-  awk 'END{print $1}' /etc/hosts
-  cat /etc/hosts
-}
-
-print_ucx_config() {
-  rapids-logger "UCX Version and Build Configuration"
-  ucx_info -v
-}
diff --git a/ci/test_wheel_distributed_ucxx.sh b/ci/test_wheel_distributed_ucxx.sh
new file mode 100755
index 00000000..c8c0bbca
--- /dev/null
+++ b/ci/test_wheel_distributed_ucxx.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+PROJECT_NAME="distributed_ucxx"
+
+source "$(dirname "$0")/test_common.sh"  # provides run_distributed_ucxx_tests
+
+mkdir -p ./dist
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="${PROJECT_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+# Install the previously built ucxx wheel before the distributed_ucxx wheel
+RAPIDS_PY_WHEEL_NAME="ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-ucxx-dep
+python -m pip install ./local-ucxx-dep/ucxx*.whl
+
+# `echo` expands the wildcard first, so the `[test]` extra is appended to the resolved wheel path
+python -m pip install $(echo ./dist/${PROJECT_NAME}*.whl)[test]
+
+# TODO: We need distributed installed in developer mode to provide test utils;
+# the clone below is not yet matched to the `rapids-dask-dependency` version.
+rapids-logger "Install Distributed in developer mode"
+git clone https://github.com/dask/distributed /tmp/distributed
+python -m pip install -e /tmp/distributed
+
+# On aarch64 pull requests run only the smoke test; otherwise run the full suite
+if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then
+  rapids-logger "Distributed Smoke Tests"
+  python -m pytest -vs ci/wheel_smoke_test_distributed_ucxx.py
+else
+  rapids-logger "Distributed Tests"
+
+  # run_distributed_ucxx_tests    PROGRESS_MODE   ENABLE_DELAYED_SUBMISSION   ENABLE_PYTHON_FUTURE
+  run_distributed_ucxx_tests      thread          1                           1
+fi
diff --git a/ci/test_wheel_ucxx.sh b/ci/test_wheel_ucxx.sh
new file mode 100755
index 00000000..c844da67
--- /dev/null
+++ b/ci/test_wheel_ucxx.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+PROJECT_NAME="ucxx"
+
+source "$(dirname "$0")/test_common.sh"  # provides run_py_tests and run_py_tests_async
+
+mkdir -p ./dist
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="${PROJECT_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+# `echo` expands the wildcard first, so the `[test]` extra is appended to the resolved wheel path
+python -m pip install $(echo ./dist/${PROJECT_NAME}*.whl)[test]
+
+# On aarch64 pull requests run only the smoke test; otherwise run the core and async suites
+if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then
+  rapids-logger "Python Async Smoke Tests"
+  python -m pytest -vs ci/wheel_smoke_test_ucxx.py
+else
+  rapids-logger "Python Core Tests"
+  run_py_tests
+
+  rapids-logger "Python Async Tests"
+  # run_py_tests_async PROGRESS_MODE   ENABLE_DELAYED_SUBMISSION ENABLE_PYTHON_FUTURE SKIP
+  run_py_tests_async   thread          1                         1                    0
+fi
diff --git a/ci/wheel_smoke_test_distributed_ucxx.py b/ci/wheel_smoke_test_distributed_ucxx.py
new file mode 100644
index 00000000..e7f9c0ef
--- /dev/null
+++ b/ci/wheel_smoke_test_distributed_ucxx.py
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: BSD-3-Clause
+
+import asyncio
+
+import pytest
+
+from distributed.comm import connect, listen
+from distributed.protocol import to_serialize
+
+import ucxx
+
+from distributed_ucxx.utils_test import gen_test, ucxx_loop
+
+
+try:
+    HOST = ucxx.get_address()  # address used to build the default `ucxx://` listen URL below
+except Exception:
+    HOST = "127.0.0.1"  # fall back to loopback when ucxx cannot provide an address
+
+
+async def get_comm_pair(
+    listen_addr=f"ucxx://{HOST}", listen_args=None, connect_args=None, **kwargs
+):
+    listen_args = listen_args or {}
+    connect_args = connect_args or {}
+    q = asyncio.queues.Queue()
+
+    async def handle_comm(comm):
+        await q.put(comm)
+
+    listener = listen(listen_addr, handle_comm, **listen_args, **kwargs)
+    async with listener:
+        comm = await connect(listener.contact_address, **connect_args, **kwargs)
+        serv_comm = await q.get()
+        return (comm, serv_comm)
+
+
+@pytest.mark.parametrize(
+    "g",  # factory taking the cudf module and returning the object to send
+    [
+        lambda cudf: cudf.Series([1, 2, 3]),
+        lambda cudf: cudf.DataFrame({"a": [1, 2, None], "b": [1.0, 2.0, None]}),
+    ],
+)
+@gen_test()
+async def test_ping_pong_cudf(ucxx_loop, g):  # send a cudf object over a comm pair and compare what arrives
+    cudf = pytest.importorskip("cudf")  # skip the test when cudf is not installed
+    from cudf.testing._utils import assert_eq  # NOTE(review): private cudf module; confirm it exists in the tested cudf
+
+    cudf_obj = g(cudf)
+
+    com, serv_com = await get_comm_pair()
+    msg = {"op": "ping", "data": to_serialize(cudf_obj)}
+
+    await com.write(msg)
+    result = await serv_com.read()
+
+    cudf_obj_2 = result.pop("data")  # what is left in `result` is just the "op" entry
+    assert result["op"] == "ping"
+    assert_eq(cudf_obj, cudf_obj_2)
+
+    await com.close()  # NOTE(review): not reached if an assertion above fails
+    await serv_com.close()
diff --git a/ci/wheel_smoke_test_ucxx.py b/ci/wheel_smoke_test_ucxx.py
new file mode 100644
index 00000000..44b0ebfe
--- /dev/null
+++ b/ci/wheel_smoke_test_ucxx.py
@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: BSD-3-Clause
+
+import asyncio
+import pickle
+
+import numpy as np
+import pytest
+
+import ucxx
+
+cudf = pytest.importorskip("cudf")
+distributed = pytest.importorskip("distributed")
+cuda = pytest.importorskip("numba.cuda")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "g",
+    [
+        lambda cudf: cudf.Series([1, 2, 3]),
+        lambda cudf: cudf.DataFrame({"a": np.random.random(1200000)}),
+    ],
+)
+async def test_send_recv_cudf(event_loop, g):
+    """Send a serialized cuDF object between two UCXX endpoints and compare."""
+    from distributed.utils import nbytes
+
+    class UCX:
+        """Frame-based writer/reader wrapped around a single UCXX endpoint."""
+
+        def __init__(self, ep):
+            self.ep = ep
+
+        async def write(self, cdf):
+            """Send frame count, CUDA flags and sizes, then each non-empty frame."""
+            header, _frames = cdf.serialize()
+            frames = [pickle.dumps(header)] + _frames
+            is_cuda = [hasattr(f, "__cuda_array_interface__") for f in frames]
+
+            # Send meta data
+            await self.ep.send(np.array([len(frames)], dtype=np.uint64))
+            await self.ep.send(np.array(is_cuda, dtype=bool))
+            await self.ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64))
+            # Send frames
+            for frame in frames:
+                if nbytes(frame) > 0:
+                    await self.ep.send(frame)
+
+        async def read(self):
+            """Receive the metadata, then one buffer per frame; return the frames."""
+            try:
+                # Recv meta data
+                nframes = np.empty(1, dtype=np.uint64)
+                await self.ep.recv(nframes)
+                is_cudas = np.empty(nframes[0], dtype=bool)
+                await self.ep.recv(is_cudas)
+                sizes = np.empty(nframes[0], dtype=np.uint64)
+                await self.ep.recv(sizes)
+            except (
+                ucxx.exceptions.UCXCanceledError,
+                ucxx.exceptions.UCXCloseError,
+            ) as e:
+                msg = "SOMETHING TERRIBLE HAS HAPPENED IN THE TEST"
+                raise type(e)(msg) from e  # `e` is an instance and is not callable
+            else:
+                # Recv frames
+                frames = []
+                for is_cuda, size in zip(is_cudas.tolist(), sizes.tolist()):
+                    if size > 0:
+                        if is_cuda:
+                            frame = cuda.device_array((size,), dtype=np.uint8)
+                        else:
+                            frame = np.empty(size, dtype=np.uint8)
+                        await self.ep.recv(frame)
+                        frames.append(frame)
+                    elif is_cuda:
+                        frames.append(cuda.device_array((0,), dtype=np.uint8))
+                    else:
+                        frames.append(b"")
+                return frames
+
+    class UCXListener:
+        def __init__(self):
+            self.comm = None  # set by the listener callback defined in start()
+
+        def start(self):
+            async def serve_forever(ep):
+                ucx = UCX(ep)
+                self.comm = ucx
+
+            self.ucxx_server = ucxx.create_listener(serve_forever)
+
+    uu = UCXListener()
+    uu.start()
+    uu.address = ucxx.get_address()
+    uu.client = await ucxx.create_endpoint(uu.address, uu.ucxx_server.port)
+    ucx = UCX(uu.client)
+    await asyncio.sleep(0.2)  # presumably lets the listener callback set uu.comm
+    msg = g(cudf)
+    frames, _ = await asyncio.gather(uu.comm.read(), ucx.write(msg))
+    ucx_header = pickle.loads(frames[0])
+    cudf_buffer = frames[1:]
+    typ = type(msg)
+    res = typ.deserialize(ucx_header, cudf_buffer)
+
+    from cudf.testing._utils import assert_eq
+
+    assert_eq(res, msg)
+    await uu.comm.ep.close()
+    await uu.client.close()
+
+    assert uu.client.closed
+    assert uu.comm.ep.closed
+    del uu.ucxx_server
+    ucxx.reset()
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a3b40936..e9634c21 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -104,9 +104,13 @@ rapids_find_package(
 # add third party dependencies using CPM
 rapids_cpm_init()
 # find rmm
-include(cmake/thirdparty/get_rmm.cmake)
+if(UCXX_ENABLE_RMM)
+  include(cmake/thirdparty/get_rmm.cmake)
+endif()
 # find or install GoogleTest
-include(cmake/thirdparty/get_gtest.cmake)
+if(BUILD_TESTS)
+  include(cmake/thirdparty/get_gtest.cmake)
+endif()
 
 # ##################################################################################################
 # * library targets -------------------------------------------------------------------------------
@@ -171,17 +175,15 @@ target_compile_definitions(
 
 # Enable RMM if necessary
 if(UCXX_ENABLE_RMM)
-    target_compile_definitions(ucxx PUBLIC UCXX_ENABLE_RMM)
-endif()
+    target_link_libraries(ucxx PUBLIC rmm::rmm)
 
-# Define spdlog level
-target_compile_definitions(ucxx PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
+    # Define UCXX_ENABLE_RMM and the spdlog active level (from RMM_LOGGING_LEVEL)
+    target_compile_definitions(ucxx PUBLIC UCXX_ENABLE_RMM "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
+endif()
 
 # Specify the target module library dependencies
-target_link_libraries(
-  ucxx
-  PUBLIC rmm::rmm ucx::ucp
-)
+target_link_libraries(ucxx PUBLIC ucx::ucp)
+
 
 # Add Conda library, and include paths if specified
 if(TARGET conda_env)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 5c56ede6..8d31e198 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -32,6 +32,7 @@ include(rapids-cython-core)
 if(NOT ucxx_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_BENCHMARKS OFF)
+  set(UCXX_ENABLE_PYTHON ON)
 
   set(_exclude_from_all "")
 
@@ -41,10 +42,14 @@ if(NOT ucxx_FOUND)
   # and modify the rpaths appropriately.
   set(cython_lib_dir ucxx)
   install(TARGETS ucxx DESTINATION ${cython_lib_dir})
+  install(TARGETS ucxx_python DESTINATION ${cython_lib_dir})
 endif()
 
 rapids_cython_init()
 
+find_package(
+  Python3 REQUIRED COMPONENTS Development.Embed
+)
 add_subdirectory(ucxx/examples)
 add_subdirectory(ucxx/_lib)
 
diff --git a/python/distributed-ucxx/distributed_ucxx/VERSION b/python/distributed-ucxx/distributed_ucxx/VERSION
new file mode 120000
index 00000000..d62dc733
--- /dev/null
+++ b/python/distributed-ucxx/distributed_ucxx/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/distributed-ucxx/distributed_ucxx/_version.py b/python/distributed-ucxx/distributed_ucxx/_version.py
new file mode 100644
index 00000000..536769cc
--- /dev/null
+++ b/python/distributed-ucxx/distributed_ucxx/_version.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (  # text of the VERSION resource inside the distributed_ucxx package
+    importlib.resources.files("distributed_ucxx")
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+__git_commit__ = ""  # NOTE(review): empty here; presumably filled at build time
diff --git a/python/distributed-ucxx/pyproject.toml b/python/distributed-ucxx/pyproject.toml
index c6a40be3..d1470018 100644
--- a/python/distributed-ucxx/pyproject.toml
+++ b/python/distributed-ucxx/pyproject.toml
@@ -14,7 +14,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache-2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
     "numba>=0.57.1",
     "rapids-dask-dependency==24.4.*",
@@ -24,7 +24,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
@@ -110,11 +109,3 @@ exclude = [
     "docs.*",
     "tests.*",
 ]
-
-[tool.versioneer]
-VCS = "git"
-style = "pep440"
-versionfile_source = "distributed_ucxx/_version.py"
-versionfile_build = "distributed_ucxx/_version.py"
-tag_prefix = "v"
-parentdir_prefix = "distributed_ucxx-"
diff --git a/python/ucxx/VERSION b/python/ucxx/VERSION
new file mode 120000
index 00000000..558194c5
--- /dev/null
+++ b/python/ucxx/VERSION
@@ -0,0 +1 @@
+../../VERSION
\ No newline at end of file
diff --git a/python/ucxx/_lib/CMakeLists.txt b/python/ucxx/_lib/CMakeLists.txt
index 21007f82..6f0c45a1 100644
--- a/python/ucxx/_lib/CMakeLists.txt
+++ b/python/ucxx/_lib/CMakeLists.txt
@@ -9,7 +9,8 @@ set(linked_libraries ucxx::ucxx ucxx::python Python3::Python)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS ucxx
+  LINKED_LIBRARIES "${linked_libraries}"
+  ASSOCIATED_TARGETS ucxx ucxx_python
 )
 
 find_package(Python REQUIRED COMPONENTS Development NumPy)
diff --git a/python/ucxx/_version.py b/python/ucxx/_version.py
new file mode 100644
index 00000000..a1f944f3
--- /dev/null
+++ b/python/ucxx/_version.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = importlib.resources.files("ucxx").joinpath("VERSION").read_text().strip()
+__git_commit__ = ""  # NOTE(review): empty here; presumably filled at build time
diff --git a/python/ucxx/examples/CMakeLists.txt b/python/ucxx/examples/CMakeLists.txt
index b7c3868c..d654d239 100644
--- a/python/ucxx/examples/CMakeLists.txt
+++ b/python/ucxx/examples/CMakeLists.txt
@@ -10,6 +10,7 @@ rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}"
+  ASSOCIATED_TARGETS ucxx ucxx_python
 )
 
 target_include_directories(python_future_task_app PRIVATE ".")