Skip to content

Commit

Permalink
Add from_arrow_host functions for cudf interop with nanoarrow (rapi…
Browse files Browse the repository at this point in the history
…dsai#15645)

Following up on rapidsai#15458 and continuing the work to address rapidsai#14926, this adds a host-memory version of `from_arrow_device` that performs the copies from host memory to create cudf objects.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: rapidsai#15645
  • Loading branch information
zeroshade authored May 29, 2024
1 parent 7b02f4b commit eafa570
Show file tree
Hide file tree
Showing 15 changed files with 1,631 additions and 224 deletions.
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,12 @@ add_library(
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/from_arrow.cu
src/interop/arrow_utilities.cpp
src/interop/to_arrow.cu
src/interop/to_arrow_device.cu
src/interop/from_arrow_device.cu
src/interop/from_arrow_host.cu
src/interop/to_arrow_schema.cpp
src/interop/to_arrow_utilities.cpp
src/interop/detail/arrow_allocator.cpp
src/io/avro/avro.cpp
src/io/avro/avro_gpu.cu
Expand Down
91 changes: 89 additions & 2 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct ArrowDeviceArray;

struct ArrowSchema;

struct ArrowArray;

namespace cudf {
/**
* @addtogroup interop_dlpack
Expand Down Expand Up @@ -348,6 +350,91 @@ std::unique_ptr<cudf::scalar> from_arrow(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::table` from given ArrowArray and ArrowSchema input
*
* The input array must be a struct array; non-struct arrays should be passed to
* `from_arrow_column` instead.
*
* The conversion will not call release on the input Array.
*
* @note Unlike `from_arrow_host`, this overload declares no default values for `stream` and
* `mr`; both must be passed explicitly.
*
* @throws std::invalid_argument if either schema or input are NULL
*
* @throws cudf::data_type_error if the input array is not a struct array.
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowArray` pointer that needs to be converted to cudf::table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate `cudf::table`
* @return cudf table generated from given arrow data
*/
std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
*
* The conversion will not call release on the input Array.
*
* @note Unlike `from_arrow_host_column`, this function declares no default values for `stream`
* and `mr`; both must be passed explicitly.
*
* @throws std::invalid_argument if either schema or input are NULL
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowArray` pointer that needs to be converted to cudf::column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate `cudf::column`
* @return cudf column generated from given arrow data
*/
std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
ArrowArray const* input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Create `cudf::table` from given ArrowDeviceArray input residing in host (CPU) memory
*
* This is the host-memory counterpart of `from_arrow_device`: the input must report a
* device_type of `ARROW_DEVICE_CPU`, and its data is copied from host memory to build the
* resulting cudf table.
*
* The input must be a struct array; non-struct arrays should be passed to
* `from_arrow_host_column` instead.
*
* The conversion will not call release on the input Array.
*
* @throws std::invalid_argument if either schema or input are NULL
*
* @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
*
* @throws cudf::data_type_error if the input array is not a struct array,
* non-struct arrays should be passed to `from_arrow_host_column` instead.
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform cuda allocation
* @return cudf table generated from the given Arrow data
*/
std::unique_ptr<table> from_arrow_host(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::column` from given ArrowDeviceArray input residing in host (CPU) memory
*
* This is the column counterpart of `from_arrow_host`: the input must report a device_type of
* `ARROW_DEVICE_CPU`, and its data is copied from host memory to build the resulting cudf
* column.
*
* The conversion will not call release on the input Array.
*
* @throws std::invalid_argument if either schema or input are NULL
*
* @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
*
* @throws cudf::data_type_error if input arrow data type is not supported in cudf.
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform cuda allocation
* @return cudf column generated from the given Arrow data
*/
std::unique_ptr<column> from_arrow_host_column(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
*
Expand Down Expand Up @@ -398,7 +485,7 @@ using unique_table_view_t =
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not
* accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input array is not a struct array, non-struct
Expand Down Expand Up @@ -446,7 +533,7 @@ using unique_column_view_t =
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not
* accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input arrow data type is not supported.
Expand Down
90 changes: 90 additions & 0 deletions cpp/src/interop/arrow_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "arrow_utilities.hpp"

#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <nanoarrow/nanoarrow.h>

namespace cudf {
namespace detail {
// Converts the Arrow type held by a nanoarrow schema view into the equivalent cudf data_type.
// Arrow types and time units without a cudf counterpart are rejected through CUDF_FAIL with
// cudf::data_type_error.
data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
{
  auto const arrow_type = arrow_view->type;

  // Timestamps: the cudf resolution is selected by the schema's time unit.
  if (arrow_type == NANOARROW_TYPE_TIMESTAMP) {
    switch (arrow_view->time_unit) {
      case NANOARROW_TIME_UNIT_SECOND: return data_type{type_id::TIMESTAMP_SECONDS};
      case NANOARROW_TIME_UNIT_MILLI: return data_type{type_id::TIMESTAMP_MILLISECONDS};
      case NANOARROW_TIME_UNIT_MICRO: return data_type{type_id::TIMESTAMP_MICROSECONDS};
      case NANOARROW_TIME_UNIT_NANO: return data_type{type_id::TIMESTAMP_NANOSECONDS};
      default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error);
    }
  }

  // Durations: same unit-driven selection as timestamps.
  if (arrow_type == NANOARROW_TYPE_DURATION) {
    switch (arrow_view->time_unit) {
      case NANOARROW_TIME_UNIT_SECOND: return data_type{type_id::DURATION_SECONDS};
      case NANOARROW_TIME_UNIT_MILLI: return data_type{type_id::DURATION_MILLISECONDS};
      case NANOARROW_TIME_UNIT_MICRO: return data_type{type_id::DURATION_MICROSECONDS};
      case NANOARROW_TIME_UNIT_NANO: return data_type{type_id::DURATION_NANOSECONDS};
      default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
    }
  }

  // Decimal128: the Arrow decimal scale is handed to cudf with its sign flipped.
  if (arrow_type == NANOARROW_TYPE_DECIMAL128) {
    return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
  }

  // Everything else maps one-to-one onto a cudf type id.
  switch (arrow_type) {
    case NANOARROW_TYPE_NA: return data_type{type_id::EMPTY};
    case NANOARROW_TYPE_BOOL: return data_type{type_id::BOOL8};
    case NANOARROW_TYPE_INT8: return data_type{type_id::INT8};
    case NANOARROW_TYPE_INT16: return data_type{type_id::INT16};
    case NANOARROW_TYPE_INT32: return data_type{type_id::INT32};
    case NANOARROW_TYPE_INT64: return data_type{type_id::INT64};
    case NANOARROW_TYPE_UINT8: return data_type{type_id::UINT8};
    case NANOARROW_TYPE_UINT16: return data_type{type_id::UINT16};
    case NANOARROW_TYPE_UINT32: return data_type{type_id::UINT32};
    case NANOARROW_TYPE_UINT64: return data_type{type_id::UINT64};
    case NANOARROW_TYPE_FLOAT: return data_type{type_id::FLOAT32};
    case NANOARROW_TYPE_DOUBLE: return data_type{type_id::FLOAT64};
    case NANOARROW_TYPE_DATE32: return data_type{type_id::TIMESTAMP_DAYS};
    case NANOARROW_TYPE_STRING: return data_type{type_id::STRING};
    case NANOARROW_TYPE_LIST: return data_type{type_id::LIST};
    case NANOARROW_TYPE_DICTIONARY: return data_type{type_id::DICTIONARY32};
    case NANOARROW_TYPE_STRUCT: return data_type{type_id::STRUCT};
    default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
  }
}

// Returns the nanoarrow type enumerator matching a cudf type id. Only the ids listed in the
// switch below are handled; any other id is rejected through CUDF_FAIL with
// cudf::data_type_error.
ArrowType id_to_arrow_type(cudf::type_id id)
{
  switch (id) {
    // Boolean
    case type_id::BOOL8: return NANOARROW_TYPE_BOOL;

    // Floating point
    case type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
    case type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;

    // Signed integers
    case type_id::INT8: return NANOARROW_TYPE_INT8;
    case type_id::INT16: return NANOARROW_TYPE_INT16;
    case type_id::INT32: return NANOARROW_TYPE_INT32;
    case type_id::INT64: return NANOARROW_TYPE_INT64;

    // Unsigned integers
    case type_id::UINT8: return NANOARROW_TYPE_UINT8;
    case type_id::UINT16: return NANOARROW_TYPE_UINT16;
    case type_id::UINT32: return NANOARROW_TYPE_UINT32;
    case type_id::UINT64: return NANOARROW_TYPE_UINT64;

    // Day-resolution timestamps are represented as Arrow DATE32.
    case type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;

    default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
  }
}

} // namespace detail
} // namespace cudf
21 changes: 21 additions & 0 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@

#pragma once

#include <cudf/types.hpp>

#include <nanoarrow/nanoarrow.h>
#include <nanoarrow/nanoarrow_types.h>

namespace cudf {
namespace detail {

Expand All @@ -26,5 +31,21 @@ namespace detail {
static constexpr int validity_buffer_idx = 0;
static constexpr int fixed_width_data_buffer_idx = 1;

/**
* @brief Map the Arrow type described by a schema view to the equivalent cudf data type
*
* Timestamp and duration types are resolved using the view's time unit, and decimal128
* types carry the negated Arrow decimal scale in the returned `data_type`.
*
* @throws cudf::data_type_error if the Arrow type, or the time unit of a timestamp/duration
* type, has no cudf equivalent
*
* @param arrow_view SchemaView to pull the logical and storage types from
* @return cudf `data_type` corresponding to the Arrow type
*/
data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view);

/**
* @brief Map cudf column type id to ArrowType id
*
* Handles BOOL8, the signed and unsigned 8/16/32/64-bit integer ids, FLOAT32, FLOAT64 and
* TIMESTAMP_DAYS (mapped to Arrow DATE32).
*
* @throws cudf::data_type_error if `id` is not one of the ids listed above
*
* @param id Column type id
* @return ArrowType id
*/
ArrowType id_to_arrow_type(cudf::type_id id);

} // namespace detail
} // namespace cudf
Loading

0 comments on commit eafa570

Please sign in to comment.