Skip to content

Commit

Permalink
Add from_arrow_device function to cudf interop using nanoarrow (#15458)

Adding a corresponding `from_arrow_device` function following up from #15047. This continues the work towards addressing #14926.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: #15458
  • Loading branch information
zeroshade authored Apr 23, 2024
1 parent 6780e59 commit 8db1851
Show file tree
Hide file tree
Showing 9 changed files with 1,488 additions and 162 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ add_library(
src/interop/from_arrow.cu
src/interop/to_arrow.cu
src/interop/to_arrow_device.cu
src/interop/from_arrow_device.cu
src/interop/to_arrow_schema.cpp
src/interop/to_arrow_utilities.cpp
src/interop/detail/arrow_allocator.cpp
Expand Down
124 changes: 124 additions & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,5 +348,129 @@ std::unique_ptr<cudf::scalar> from_arrow(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Alias for a vector of owning `cudf::column` pointers
*
* Used when converting from an `ArrowDeviceArray` to hold the columns that had to be
* allocated during conversion, i.e. data that could not be referenced zero-copy.
*/
using owned_columns_t = std::vector<std::unique_ptr<cudf::column>>;

/**
 * @brief Deleter functor for a `std::unique_ptr` to a view that also keeps converted
 * columns alive
 *
 * Not everything in an ArrowDeviceArray can be used zero-copy (i.e. bools or non-UINT32
 * dictionary indices); such data is materialized into newly allocated columns. A view
 * such as `cudf::table_view` doesn't hold ownership, so this deleter carries those
 * columns and ties their lifetime to the `unique_ptr` that holds the view.
 *
 * @tparam ViewType Type of the view object managed by the `unique_ptr`
 */
template <typename ViewType>
struct custom_view_deleter {
  owned_columns_t owned_mem_;  ///< Columns kept alive for as long as this deleter exists.

  /**
   * @brief Build a deleter that takes over the given columns
   *
   * @param owned Vector of owning columns; its contents are moved into the deleter
   */
  explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_(std::move(owned)) {}

  /**
   * @brief Destroy the view object held by the `unique_ptr`
   *
   * @param ptr Pointer to the view object to be deleted
   */
  void operator()(ViewType* ptr) const
  {
    delete ptr;
  }
};

/**
* @brief Alias for a `std::unique_ptr` to a `cudf::table_view` that uses
* `custom_view_deleter` as its deleter
*
* The deleter stores the owned columns (`owned_mem_`), so they are destroyed together
* with the `unique_ptr` that holds the view.
*/
using unique_table_view_t =
std::unique_ptr<cudf::table_view, custom_view_deleter<cudf::table_view>>;

/**
* @brief Create `cudf::table_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
* Constructs a non-owning `cudf::table_view` using `ArrowDeviceArray` and `ArrowSchema`;
* the data must be accessible to the CUDA device. Because the resulting `cudf::table_view`
* will not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the
* result. It is the responsibility of callers to ensure they call the release callback on
* the `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is
* not accessed after this happens.
*
* Each child of the input struct array becomes one column of the resulting table_view.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input array is not a struct array; non-struct
* arrays should be passed to `from_arrow_device_column` instead.
*
* @throws cudf::data_type_error if the input arrow data type is not supported.
*
* @note The custom deleter used for the unique_ptr to the table_view maintains ownership
* over any memory which is allocated, such as converting boolean columns from the bitmap
* used by Arrow to the 1-byte per value for cudf.
*
* @note If the input `ArrowDeviceArray` contains a non-null sync_event it is assumed
* to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
* on it with the event. This function, however, will not explicitly synchronize on the
* stream.
*
* @param schema `ArrowSchema` pointer to object describing the type of the device array
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform any allocations
* @return `unique_table_view_t`: a `std::unique_ptr` to the `cudf::table_view` generated
* from the given Arrow data, whose deleter owns any columns allocated during conversion
*/
unique_table_view_t from_arrow_device(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
// NOTE(review): the `from_arrow` overload declared just before this takes
// `rmm::device_async_resource_ref mr`; confirm whether this parameter should match.
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Alias for a `std::unique_ptr` to a `cudf::column_view` that uses
* `custom_view_deleter` as its deleter
*
* The deleter stores the owned columns (`owned_mem_`), so they are destroyed together
* with the `unique_ptr` that holds the view.
*/
using unique_column_view_t =
std::unique_ptr<cudf::column_view, custom_view_deleter<cudf::column_view>>;

/**
* @brief Create `cudf::column_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
* Constructs a non-owning `cudf::column_view` using `ArrowDeviceArray` and `ArrowSchema`;
* the data must be accessible to the CUDA device. Because the resulting `cudf::column_view`
* will not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the
* result. It is the responsibility of callers to ensure they call the release callback on
* the `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is
* not accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input arrow data type is not supported.
*
* @note The custom deleter used for the unique_ptr to the column_view maintains ownership
* over any memory which is allocated, such as converting boolean columns from the bitmap
* used by Arrow to the 1-byte per value for cudf.
*
* @note If the input `ArrowDeviceArray` contains a non-null sync_event it is assumed
* to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
* on it with the event. This function, however, will not explicitly synchronize on the
* stream.
*
* @param schema `ArrowSchema` pointer to object describing the type of the device array
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform any allocations
* @return `unique_column_view_t`: a `std::unique_ptr` to the `cudf::column_view` generated
* from the given Arrow data, whose deleter owns any columns allocated during conversion
*/
unique_column_view_t from_arrow_device_column(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
// NOTE(review): the `from_arrow` overload declared earlier in this header takes
// `rmm::device_async_resource_ref mr`; confirm whether this parameter should match.
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
30 changes: 30 additions & 0 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace cudf {
namespace detail {

// Buffer positions inside an Arrow array. A namespace-scope `constexpr` variable already
// has internal linkage, so no explicit `static` is needed for use in a header.

/**
 * @brief Index of the validity buffer among the buffers of an Arrow array
 */
constexpr int validity_buffer_idx{0};

/**
 * @brief Index of the data buffer among the buffers of a fixed-width Arrow array
 */
constexpr int fixed_width_data_buffer_idx{1};

}  // namespace detail
}  // namespace cudf
Loading

0 comments on commit 8db1851

Please sign in to comment.