Skip to content

Commit

Permalink
Remove arrow dependency (rapidsai#16640)
Browse files Browse the repository at this point in the history
This PR removes libarrow as a dependency of libcudf since we no longer use any of its APIs in our C++ code. The following places remain dependent on libarrow:
- tests: We have tests demonstrating how to interoperate with libarrow objects, as well as other tests that leverage Arrow for I/O.
- examples: We have an example demonstrating interop with libarrow arrays.
- JNI: The JNI is still using libarrow to handle ingestion or production of Arrow buffers.

In all three cases above, we are now statically linking libarrow. We also always pull it in via CPM, which means that we no longer require libarrow to exist on the user's system. Of the three cases above, we should expect the first two to persist indefinitely. The JNI could be updated to use nanoarrow instead if desired, but that is not critical: the primary benefit of removing libarrow as a direct dependency is that it no longer acts as a constraint for package managers such as conda in environments where we must match the version of Arrow required by other dependencies.

pyarrow remains a dependency of the cudf Python packages. For now, this PR retains the tight pinning on 16.1 since we know that this version works. A future PR will loosen this pinning, since we are no longer constrained to ABI-compatible versions and can accept any pyarrow version that provides the necessary Python APIs (I believe pyarrow>=13 will work, but that remains to be tested).

Resolves rapidsai#15193

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)
  - Robert Maynard (https://github.com/robertmaynard)
  - https://github.com/jakirkham
  - MithunR (https://github.com/mythrocks)

URL: rapidsai#16640
  • Loading branch information
vyasr authored Aug 27, 2024
1 parent 6747d2d commit 1a2aad2
Show file tree
Hide file tree
Showing 29 changed files with 145 additions and 351 deletions.
1 change: 0 additions & 1 deletion ci/build_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"

python -m auditwheel repair \
--exclude libcudf.so \
--exclude libarrow.so.1601 \
--exclude libnvcomp.so \
--exclude libnvcomp_bitcomp.so \
--exclude libnvcomp_gdeflate.so \
Expand Down
2 changes: 1 addition & 1 deletion ci/build_wheel_libcudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ package_dir="python/libcudf"
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

mkdir -p ${package_dir}/final_dist
python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/*
python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*

RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
1 change: 0 additions & 1 deletion ci/build_wheel_pylibcudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"

python -m auditwheel repair \
--exclude libcudf.so \
--exclude libarrow.so.1601 \
--exclude libnvcomp.so \
--exclude libnvcomp_bitcomp.so \
--exclude libnvcomp_gdeflate.so \
Expand Down
6 changes: 0 additions & 6 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,11 @@ dependencies:
- hypothesis
- identify>=2.5.20
- ipython
- libarrow-acero==16.1.0.*
- libarrow-dataset==16.1.0.*
- libarrow==16.1.0.*
- libcufile-dev=1.4.0.31
- libcufile=1.4.0.31
- libcurand-dev=10.3.0.86
- libcurand=10.3.0.86
- libkvikio==24.10.*,>=0.0.0a0
- libparquet==16.1.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.10.*,>=0.0.0a0
- make
Expand All @@ -56,7 +52,6 @@ dependencies:
- ninja
- notebook
- numba>=0.57
- numpy
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
Expand All @@ -68,7 +63,6 @@ dependencies:
- pandoc
- pre-commit
- ptxcompiler
- pyarrow==16.1.0.*
- pydata-sphinx-theme!=0.14.2
- pytest-benchmark
- pytest-cases>=3.8.2
Expand Down
6 changes: 0 additions & 6 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,9 @@ dependencies:
- hypothesis
- identify>=2.5.20
- ipython
- libarrow-acero==16.1.0.*
- libarrow-dataset==16.1.0.*
- libarrow==16.1.0.*
- libcufile-dev
- libcurand-dev
- libkvikio==24.10.*,>=0.0.0a0
- libparquet==16.1.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.10.*,>=0.0.0a0
- make
Expand All @@ -55,7 +51,6 @@ dependencies:
- ninja
- notebook
- numba>=0.57
- numpy
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==3.0.6
Expand All @@ -65,7 +60,6 @@ dependencies:
- pandas>=2.0,<2.2.3dev0
- pandoc
- pre-commit
- pyarrow==16.1.0.*
- pydata-sphinx-theme!=0.14.2
- pynvjitlink>=0.0.0a0
- pytest-benchmark
Expand Down
4 changes: 1 addition & 3 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ requirements:
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.10.0
- dlpack >=0.8,<1.0
- numpy 2.0
- pyarrow ==16.1.0.*
- libcudf ={{ version }}
- pylibcudf ={{ version }}
- rmm ={{ minor_version }}
Expand All @@ -84,7 +82,7 @@ requirements:
- cupy >=12.0.0
- numba >=0.57
- numpy >=1.23,<3.0a0
- {{ pin_compatible('pyarrow', max_pin='x.x') }}
- pyarrow ==16.1.0.*
- libcudf ={{ version }}
- pylibcudf ={{ version }}
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
3 changes: 0 additions & 3 deletions conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ c_stdlib_version:
cmake_version:
- ">=3.26.4,!=3.30.0"

libarrow_version:
- "==16.1.0"

dlpack_version:
- ">=0.8,<1.0"

Expand Down
2 changes: 0 additions & 2 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ requirements:
{% endif %}
- cuda-version ={{ cuda_version }}
- nvcomp {{ nvcomp_version }}
- libarrow {{ libarrow_version }}
- dlpack {{ dlpack_version }}
- librdkafka {{ librdkafka_version }}
- fmt {{ fmt_version }}
Expand Down Expand Up @@ -92,7 +91,6 @@ outputs:
- cmake {{ cmake_version }}
host:
- cuda-version ={{ cuda_version }}
- libarrow {{ libarrow_version }}
run:
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
{% if cuda_major == "11" %}
Expand Down
4 changes: 1 addition & 3 deletions conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ requirements:
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.10.0
- dlpack >=0.8,<1.0
- numpy 2.0
- pyarrow ==16.1.0.*
- libcudf ={{ version }}
- rmm ={{ minor_version }}
{% if cuda_major == "11" %}
Expand All @@ -81,7 +79,7 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
- numpy >=1.23,<3.0a0
- {{ pin_compatible('pyarrow', max_pin='x.x') }}
- pyarrow ==16.1.0.*
- {{ pin_compatible('rmm', max_pin='x.x') }}
- fsspec >=0.6.0
{% if cuda_major == "11" %}
Expand Down
27 changes: 1 addition & 26 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,6 @@ mark_as_advanced(CUDF_BUILD_TESTUTIL)
option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF)
option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" OFF)
option(
CUDF_USE_PER_THREAD_DEFAULT_STREAM
"Build cuDF with per-thread default stream, including passing the per-thread default
Expand All @@ -81,8 +76,6 @@ option(CUDA_ENABLE_LINEINFO
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
Expand All @@ -100,8 +93,6 @@ message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}")
message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}")
message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}")
message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}")
message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}")
message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}")
message(
VERBOSE
Expand Down Expand Up @@ -192,8 +183,6 @@ include(cmake/thirdparty/get_nvcomp.cmake)
include(cmake/thirdparty/get_cccl.cmake)
# find rmm
include(cmake/thirdparty/get_rmm.cmake)
# find arrow
include(cmake/thirdparty/get_arrow.cmake)
# find flatbuffers
include(cmake/thirdparty/get_flatbuffers.cmake)
# find dlpack
Expand Down Expand Up @@ -807,7 +796,7 @@ add_dependencies(cudf jitify_preprocess_run)
# Specify the target module library dependencies
target_link_libraries(
cudf
PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
PUBLIC CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
kvikio::kvikio $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
)
Expand Down Expand Up @@ -1056,20 +1045,6 @@ following IMPORTED GLOBAL targets:
]=]
)

if(CUDF_ENABLE_ARROW_PARQUET)
string(
APPEND
install_code_string
[=[
if(NOT Parquet_DIR)
set(Parquet_DIR "${Arrow_DIR}")
endif()
set(ArrowDataset_DIR "${Arrow_DIR}")
find_dependency(ArrowDataset)
]=]
)
endif()

rapids_export(
INSTALL cudf
EXPORT_SET cudf-exports ${_components_export_string}
Expand Down
Loading

0 comments on commit 1a2aad2

Please sign in to comment.