From 7e58a7abb8b29f63005dfc92aba84da2ea4007a6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 May 2024 03:15:17 +0000 Subject: [PATCH 01/45] Python bindings + initial artifacts for arrow schema in PQ writer --- cpp/include/cudf/io/parquet.hpp | 56 ++++++++ cpp/include/cudf/types.hpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 124 ++++++++++++++---- cpp/src/io/parquet/writer_impl.hpp | 1 + python/cudf/cudf/_lib/parquet.pyx | 11 +- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 18 ++- python/cudf/cudf/io/parquet.py | 4 + 7 files changed, 185 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index b2f949cdcee..ac04cd4c11f 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -602,6 +602,8 @@ class parquet_writer_options { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = true; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -731,6 +733,13 @@ class parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns Column chunks file paths to be set in the raw output metadata. * @@ -882,6 +891,13 @@ class parquet_writer_options { */ void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. 
+ */ + void enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } + /** * @brief Sets column chunks file path to be set in the raw output metadata. * @@ -1215,6 +1231,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + parquet_writer_options_builder& write_arrow_schema(bool enabled) + { + options._write_arrow_schema = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * @@ -1298,6 +1326,8 @@ class chunked_parquet_writer_options { bool _write_timestamps_as_int96 = false; // Parquet writer can write timestamps as UTC. Defaults to true. bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = true; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -1390,6 +1420,13 @@ class chunked_parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns maximum row group size, in bytes. * @@ -1525,6 +1562,13 @@ class chunked_parquet_writer_options { */ void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. + */ + void enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } + /** * @brief Sets the maximum row group size, in bytes. 
* @@ -1711,6 +1755,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + chunked_parquet_writer_options_builder& write_arrow_schema(bool enabled) + { + options._write_arrow_schema = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index baf07fa3db6..101791cee0b 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,7 +216,11 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - DURATION_DAYS, ///< time interval of days in int32 + TIME_SECONDS, ///< time of day since midnight in seconds in int64 + TIME_MILLISECONDS, ///< time of day since midnight in milliseconds in int64 + TIME_MICROSECONDS, ///< time of day since midnight in microseconds in int64 + TIME_NANOSECONDS, ///< time of day since midnight in nanoseconds in int64 + DURATION_DAYS, ///< time interval of days in int64 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 DURATION_MICROSECONDS, ///< time interval of microseconds in int64 diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1dfced94f5b..c67c02655bb 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -25,8 +25,11 @@ #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" +#include "io/utilities/base64_utilities.hpp" #include "io/utilities/column_utils.cuh" #include 
"io/utilities/config_utils.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" @@ -66,12 +69,29 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +/** + * @brief Construct and return arrow schema ipc message from input parquet schema + * + * Recursively traverses through parquet schema to construct arrow schema tree. + * The resulting schema tree is serialized and stored as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header (metadata) size (padded for 16 byte alignment) and a continuation + * string. The final string is base64 encoded and returned to be stored at the keyvalue + * metadata section of the Parquet file footer. + */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // TODO: dummy return empty string for now + return cudf::io::detail::base64_encode(""); +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, host_span tbl_schema, size_type num_columns, - statistics_freq stats_granularity) + statistics_freq stats_granularity, + bool const write_arrow_schema) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -85,6 +105,9 @@ struct aggregate_writer_metadata { this->column_orders = std::vector(num_columns, default_order); } + // Construct the arrow schema ipc message string. 
+ auto const arrow_schema_ipc_message = construct_arrow_schema_ipc_message(schema); + for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), kv_md[p].end(), @@ -92,6 +115,11 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); + // Append arrow schema to the key_value_metadata + if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { + this->files[p].key_value_metadata.emplace_back( + KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); + } } } @@ -507,52 +535,90 @@ struct leaf_schema_fn { } } - // unsupported outside cudf for parquet 1.0. + /* TODO: This code block should be ``time`` type and not ``duration`` type + // unsupported outside cudf for parquet 1.0. + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, 
TimeUnit::MICROS}}; + } + + // unsupported outside cudf for parquet 1.0. + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } + */ + template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; + col_schema.ts_scale = 24 * 60 * 60; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_MILLISECONDS; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_MICROSECONDS; } - // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_NANOSECONDS; } template @@ -625,7 +691,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. */ -std::vector construct_schema_tree( +std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, @@ -1703,12 +1769,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, + bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { auto vec = table_to_linked_columns(input); auto schema_tree = - construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); + construct_parquet_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1831,7 +1898,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity); + partitions, kv_meta, this_table_schema, num_columns, stats_granularity, write_arrow_schema); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2312,6 +2379,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2342,6 +2410,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2420,6 +2489,7 @@ void writer::impl::write(table_view const& input, std::vector co _int96_timestamps, _utc_timestamps, _write_v2_headers, + _write_arrow_schema, _out_sink, _stream); } catch (...) 
{ // catch any exception type diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 784f78f06d5..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -156,6 +156,7 @@ class writer::impl { bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; + bool const _write_arrow_schema; std::optional> _sorting_columns; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70acb7f917b..6b78ac0199c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -408,6 +408,7 @@ def write_parquet( object force_nullable_schema=False, header_version="1.0", use_dictionary=True, + write_arrow_schema=True, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -507,6 +508,7 @@ def write_parquet( .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) .build() ) if partitions_info is not None: @@ -586,6 +588,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. + write_arrow_schema : bool, default True + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section. 
See Also -------- cudf.io.parquet.write_parquet @@ -604,6 +609,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows cdef size_t max_dictionary_size cdef cudf_io_types.dictionary_policy dict_policy + cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", @@ -612,7 +618,8 @@ cdef class ParquetWriter: int max_page_size_bytes=524288, int max_page_size_rows=20000, int max_dictionary_size=1048576, - bool use_dictionary=True): + bool use_dictionary=True, + bool store_schema=True): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -633,6 +640,7 @@ cdef class ParquetWriter: if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) + self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -751,6 +759,7 @@ cdef class ParquetWriter: .max_page_size_bytes(self.max_page_size_bytes) .max_page_size_rows(self.max_page_size_rows) .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) .build() ) args.set_dictionary_policy(self.dict_policy) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 33a594b432f..cb4ce142543 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -80,6 +80,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_partitions( vector[cudf_io_types.partition_info] partitions @@ -99,12 +100,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + 
- void set_int96_timestamps( - bool enabled - ) except + - void set_utc_timestamps( - bool enabled - ) except + + void enable_int96_timestamps(bool val) except + + void enable_utc_timestamps(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + @@ -147,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& write_arrow_schema( + bool enabled + ) except + parquet_writer_options_builder& utc_timestamps( bool enabled ) except + @@ -190,6 +191,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_metadata( cudf_io_types.table_input_metadata m @@ -215,6 +217,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_page_size_rows(size_type val) except + void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + @staticmethod @@ -245,6 +248,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& utc_timestamps( bool enabled ) except + + chunked_parquet_writer_options_builder& write_arrow_schema( + bool enabled + ) except + chunked_parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a6c67d22af7..fa93cd9fd29 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -69,6 +69,7 @@ def _write_parquet( 
force_nullable_schema=False, header_version="1.0", use_dictionary=True, + write_arrow_schema=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -102,6 +103,7 @@ def _write_parquet( "force_nullable_schema": force_nullable_schema, "header_version": header_version, "use_dictionary": use_dictionary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -906,6 +908,7 @@ def to_parquet( force_nullable_schema=False, header_version="1.0", use_dictionary=True, + store_schema=True, *args, **kwargs, ): @@ -983,6 +986,7 @@ def to_parquet( force_nullable_schema=force_nullable_schema, header_version=header_version, use_dictionary=use_dictionary, + write_arrow_schema=store_schema, ) else: From 7351f91f84dc13a0e9dab8e7eeba96553968aed8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 23 May 2024 06:01:29 +0000 Subject: [PATCH 02/45] Add artifacts to build flatbuffers. --- cpp/src/io/parquet/arrow_schema_writer.hpp | 137 +++++++++++++++++++++ cpp/src/io/parquet/parquet_common.hpp | 5 + cpp/src/io/parquet/reader_impl_helpers.cpp | 5 - cpp/src/io/parquet/reader_impl_helpers.hpp | 3 + cpp/src/io/parquet/writer_impl.cu | 19 +-- 5 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 cpp/src/io/parquet/arrow_schema_writer.hpp diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp new file mode 100644 index 00000000000..340e99c82a0 --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema.hpp + * @brief Arrow IPC schema writer implementation + */ + +#pragma once + +#include "io/parquet/parquet.hpp" +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +namespace flatbuf = cudf::io::parquet::flatbuf; + +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + +class FieldPosition { + public: + FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + + FieldPosition child(int index) const { return {this, index}; } + + std::vector path() const + { + std::vector path(_depth); + const FieldPosition* cur = this; + for (int i = _depth - 1; i >= 0; --i) { + path[i] = cur->_index; + cur = cur->parent_; + } + return path; + } + + protected: + FieldPosition(const FieldPosition* parent, int index) + : parent_(parent), _index(index), _depth(parent->_depth + 1) + { + } + + const FieldPosition* parent_; + int _index; + int _depth; +}; + +std::vector make_field_offsets(host_span parquet_schema) +{ + // MH: Get here + std::vector field_offsets; + FieldPosition pos; + + for (size_type i = 0; i < static_cast(parquet_schema.size()); ++i) { + FieldOffset offset; + // FieldToFlatbufferVisitor 
field_visitor(fbb, mapper, pos.child(i)); + // field_visitor.GetResult(schema.field(i), &offset); + field_offsets.push_back(offset); + } + return field_offsets; +} + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. + */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + FlatBufferBuilder fbb; + auto fb_offsets = fbb.CreateVector(make_field_offsets(parquet_schema)); + + flatbuffers::Offset const fb_schema = + flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); + + auto const ipc_message_flatbuffer = flatbuf::CreateMessage(fbb, + flatbuf::MetadataVersion_V5, + flatbuf::MessageHeader_Schema, + fb_schema.Union(), + 0 /* body_length */); + fbb.Finish(ipc_message_flatbuffer); + + int32_t metadata_len = fbb.GetSize(); + + // Store the final string here to pass its view to base64_encode + std::string ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + convert_int32_to_byte_string(metadata_len) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + + // encode the final ipc message to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp 
b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..be469dd25c5 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -26,6 +26,11 @@ auto constexpr MAX_DECIMAL32_PRECISION = 9; auto constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) +// Constants copied from arrow source and renamed to match the case +constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +constexpr int32_t IPC_CONTINUATION_TOKEN = -1; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index eb653c6b9ac..565dc2e02f2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -786,11 +786,6 @@ void aggregate_reader_metadata::apply_arrow_schema() std::optional aggregate_reader_metadata::decode_ipc_message( std::string_view const serialized_message) const { - // Constants copied from arrow source and renamed to match the case - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); - constexpr int32_t IPC_CONTINUATION_TOKEN = -1; - // message buffer auto message_buf = serialized_message.data(); // current message (buffer) size diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9aeb19a7723..8b0f59ef33d 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -117,6 +117,9 @@ struct metadata : public FileMetaData { void sanitize_schema(); }; +/** + * @brief Class to extract data types from arrow schema tree + */ struct arrow_schema_data_types { std::vector 
children; data_type type{type_id::EMPTY}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c67c02655bb..607b1f8b0f5 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" @@ -69,22 +70,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -/** - * @brief Construct and return arrow schema ipc message from input parquet schema - * - * Recursively traverses through parquet schema to construct arrow schema tree. - * The resulting schema tree is serialized and stored as the header (or metadata) of - * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended - * with header (metadata) size (padded for 16 byte alignment) and a continuation - * string. The final string is base64 encoded and returned to be stored at the keyvalue - * metadata section of the Parquet file footer. 
- */ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) -{ - // TODO: dummy return empty string for now - return cudf::io::detail::base64_encode(""); -} - struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -115,7 +100,7 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); - // Append arrow schema to the key_value_metadata + // Append arrow schema to the key-value metadata if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); From 9aca785066c4f079d052a13559b610369ff4c4d8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 23 May 2024 07:19:25 +0000 Subject: [PATCH 03/45] Add basic artifacts to construct the field vector. --- cpp/src/io/parquet/arrow_schema_writer.hpp | 45 +++++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 340e99c82a0..24ebbc61812 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -76,18 +77,41 @@ class FieldPosition { int _depth; }; -std::vector make_field_offsets(host_span parquet_schema) +struct dispatch_to_flatbuf_type {}; + +std::vector make_field_offsets(FlatBufferBuilder& fbb, + host_span parquet_schema) { // MH: Get here std::vector field_offsets; - FieldPosition pos; + [[maybe_unused]] FieldPosition pos; + + // Create flatbuffer Fields and insert in field offsets vector + std::transform(parquet_schema.begin(), + parquet_schema.end(), + std::back_inserter(field_offsets), + [&](auto schema_elem) { + auto fb_name = fbb.CreateString(schema_elem.name); + auto is_nullable = + schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or + 
schema_elem.repetition_type == FieldRepetitionType::REPEATED; + + auto type_type = flatbuf::Type_NONE; + Offset type_offset; + + DictionaryOffset dictionary = 0; + + std::vector children{}; + auto fb_children = fbb.CreateVector(children.data(), children.size()); + // cudf::type_dispatcher( + // schema_elem.type, dispatch_to_flatbuf_type{}, schema_elem, type_offset, + // children); + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); + }); - for (size_type i = 0; i < static_cast(parquet_schema.size()); ++i) { - FieldOffset offset; - // FieldToFlatbufferVisitor field_visitor(fbb, mapper, pos.child(i)); - // field_visitor.GetResult(schema.field(i), &offset); - field_offsets.push_back(offset); - } return field_offsets; } @@ -110,7 +134,8 @@ std::string construct_arrow_schema_ipc_message(host_span pa }; FlatBufferBuilder fbb; - auto fb_offsets = fbb.CreateVector(make_field_offsets(parquet_schema)); + auto field_offsets = make_field_offsets(fbb, parquet_schema); + auto fb_offsets = fbb.CreateVector(field_offsets); flatbuffers::Offset const fb_schema = flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); @@ -119,7 +144,7 @@ std::string construct_arrow_schema_ipc_message(host_span pa flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), - 0 /* body_length */); + 0 /* body_length = 0 */); fbb.Finish(ipc_message_flatbuffer); int32_t metadata_len = fbb.GetSize(); From de0fc403520edd7a6713f7a0b2dbe6a60fb202dc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 24 May 2024 17:39:35 +0000 Subject: [PATCH 04/45] Add artifacts for arrow schema in pq writer --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/arrow_schema_writer.cpp | 159 +++++++++++++++++++++ cpp/src/io/parquet/arrow_schema_writer.hpp | 103 +------------ 3 files changed, 163 insertions(+), 100 deletions(-) create mode 100644 cpp/src/io/parquet/arrow_schema_writer.cpp 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7390c465ccb..dd3ffe3bc12 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -407,6 +407,7 @@ add_library( src/io/orc/stripe_init.cu src/datetime/timezone.cpp src/io/orc/writer_impl.cu + src/io/parquet/arrow_schema_writer.cpp src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp new file mode 100644 index 00000000000..55e1a081c15 --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file arrow_schema.cpp + * @brief Arrow IPC schema writer implementation + */ + +#include "arrow_schema_writer.hpp" + +namespace cudf::io::parquet::detail { + +// Helper class copied over from Arrow source +class FieldPosition { + public: + FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + + FieldPosition child(int index) const { return {this, index}; } + + std::vector path() const + { + std::vector path(_depth); + const FieldPosition* cur = this; + for (int i = _depth - 1; i >= 0; --i) { + path[i] = cur->_index; + cur = cur->parent_; + } + return path; + } + + protected: + FieldPosition(const FieldPosition* parent, int index) + : parent_(parent), _index(index), _depth(parent->_depth + 1) + { + } + + const FieldPosition* parent_; + int _index; + int _depth; +}; + +// Functor for cudf to flatbuf::type conversion +struct dispatch_to_flatbuf_type {}; + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. 
+ */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // intantiate a flatbuffer builder + FlatBufferBuilder fbb; + + // Lambda function to construct a tree of arrow schema fields + std::function make_arrow_schema_fields = + [&](FieldPosition pos, int32_t const schema_idx) -> FieldOffset { + SchemaElement const schema_elem = parquet_schema[schema_idx]; + + std::vector children{}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(schema_elem.num_children), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(pos.child(idx), schema_elem.children_idx[idx]); + }); + + auto type_type = flatbuf::Type_NONE; + Offset type_offset; + + // TODO: Implement functor + /*cudf::type_dispatcher(schema_elem.arrow_type.value_or(type_id::EMPTY), + dispatch_to_flatbuf_type{}, + schema_elem, + type_offset, + type_type, + children);*/ + + auto const fb_name = fbb.CreateString(schema_elem.name); + auto const fb_children = fbb.CreateVector(children.data(), children.size()); + auto const is_nullable = schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or + schema_elem.repetition_type == FieldRepetitionType::REPEATED; + DictionaryOffset dictionary = 0; + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); + }; + + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + // TODO: What to do with this? 
+ [[maybe_unused]] FieldPosition pos; + std::vector field_offsets; + + // populate field offsets (aka schema fields) + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(parquet_schema[0].num_children), + std::back_inserter(field_offsets), + [&](auto const idx) { + return make_arrow_schema_fields(pos.child(idx), + parquet_schema[0].children_idx[idx]); + }); + + // Create a flatbuffer vector from the field offset vector + auto const fb_offsets = fbb.CreateVector(field_offsets); + + // Create an arrow:schema flatbuffer + flatbuffers::Offset const fb_schema = + flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); + + // Schema type message has zero length body + constexpr int64_t bodylength = 0; + + // Create an ipc message flatbuffer + auto const ipc_message_flatbuffer = flatbuf::CreateMessage( + fbb, flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), bodylength); + + // All done, finish building flatbuffers + fbb.Finish(ipc_message_flatbuffer); + + // Since the ipc message doesn't have a body or other custom key value metadata, + // its size is equal to the size of its header (the schema flatbuffer) + int32_t const metadata_len = fbb.GetSize(); + + // Construct the final string and store in this variable here to use in base64_encode + std::string const ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + convert_int32_to_byte_string(metadata_len) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + + // Encode the final ipc message string to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 24ebbc61812..c9217b8d376 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -31,6 +31,8 @@ #include #include +#include + 
#include #include #include @@ -49,72 +51,6 @@ using FieldOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; -class FieldPosition { - public: - FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} - - FieldPosition child(int index) const { return {this, index}; } - - std::vector path() const - { - std::vector path(_depth); - const FieldPosition* cur = this; - for (int i = _depth - 1; i >= 0; --i) { - path[i] = cur->_index; - cur = cur->parent_; - } - return path; - } - - protected: - FieldPosition(const FieldPosition* parent, int index) - : parent_(parent), _index(index), _depth(parent->_depth + 1) - { - } - - const FieldPosition* parent_; - int _index; - int _depth; -}; - -struct dispatch_to_flatbuf_type {}; - -std::vector make_field_offsets(FlatBufferBuilder& fbb, - host_span parquet_schema) -{ - // MH: Get here - std::vector field_offsets; - [[maybe_unused]] FieldPosition pos; - - // Create flatbuffer Fields and insert in field offsets vector - std::transform(parquet_schema.begin(), - parquet_schema.end(), - std::back_inserter(field_offsets), - [&](auto schema_elem) { - auto fb_name = fbb.CreateString(schema_elem.name); - auto is_nullable = - schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or - schema_elem.repetition_type == FieldRepetitionType::REPEATED; - - auto type_type = flatbuf::Type_NONE; - Offset type_offset; - - DictionaryOffset dictionary = 0; - - std::vector children{}; - auto fb_children = fbb.CreateVector(children.data(), children.size()); - // cudf::type_dispatcher( - // schema_elem.type, dispatch_to_flatbuf_type{}, schema_elem, type_offset, - // children); - - // push to field offsets vector - return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); - }); - - return field_offsets; -} - /** * @brief Construct and return arrow schema from input parquet schema * @@ -124,39 +60,6 @@ std::vector 
make_field_offsets(FlatBufferBuilder& fbb, * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. */ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) -{ - // Lambda function to convert int32 to a string of uint8 bytes - auto const convert_int32_to_byte_string = [&](int32_t const value) { - std::array buffer; - std::memcpy(buffer.data(), &value, sizeof(int32_t)); - return std::string(reinterpret_cast(buffer.data()), buffer.size()); - }; - - FlatBufferBuilder fbb; - auto field_offsets = make_field_offsets(fbb, parquet_schema); - auto fb_offsets = fbb.CreateVector(field_offsets); - - flatbuffers::Offset const fb_schema = - flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); - - auto const ipc_message_flatbuffer = flatbuf::CreateMessage(fbb, - flatbuf::MetadataVersion_V5, - flatbuf::MessageHeader_Schema, - fb_schema.Union(), - 0 /* body_length = 0 */); - fbb.Finish(ipc_message_flatbuffer); - - int32_t metadata_len = fbb.GetSize(); - - // Store the final string here to pass its view to base64_encode - std::string ipc_message = - convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + - convert_int32_to_byte_string(metadata_len) + - std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); - - // encode the final ipc message to base64 and return - return cudf::io::detail::base64_encode(ipc_message); -} +std::string construct_arrow_schema_ipc_message(host_span parquet_schema); } // namespace cudf::io::parquet::detail From 497727ebce22a3fae86b382ad1a1ad8b5c275479 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 24 May 2024 17:44:27 +0000 Subject: [PATCH 05/45] merge with upstream --- python/cudf/cudf/io/parquet.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ac8b79424b3..25647d16271 100644 --- a/python/cudf/cudf/io/parquet.py +++ 
b/python/cudf/cudf/io/parquet.py @@ -107,6 +107,11 @@ def _write_parquet( "force_nullable_schema": force_nullable_schema, "header_version": header_version, "use_dictionary": use_dictionary, + "skip_compression": skip_compression, + "column_encoding": column_encoding, + "column_type_length": column_type_length, + "output_as_binary": output_as_binary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -953,6 +958,11 @@ def to_parquet( force_nullable_schema=False, header_version="1.0", use_dictionary=True, + skip_compression=None, + column_encoding=None, + column_type_length=None, + output_as_binary=None, + store_schema=True, *args, **kwargs, ): @@ -1036,6 +1046,11 @@ def to_parquet( force_nullable_schema=force_nullable_schema, header_version=header_version, use_dictionary=use_dictionary, + skip_compression=skip_compression, + column_encoding=column_encoding, + column_type_length=column_type_length, + output_as_binary=output_as_binary, + write_arrow_schema=store_schema, ) else: From d166fe6f367c36db86e460c095267296df66ac90 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 25 May 2024 02:23:49 +0000 Subject: [PATCH 06/45] Workin arrow schema builder. 
Need to handle nested_types and dict32 --- cpp/src/io/parquet/arrow_schema_writer.cpp | 352 +++++++++++++++++---- cpp/src/io/parquet/arrow_schema_writer.hpp | 13 +- cpp/src/io/parquet/writer_impl.cu | 15 +- 3 files changed, 317 insertions(+), 63 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 55e1a081c15..9f55a61f630 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -23,10 +23,39 @@ namespace cudf::io::parquet::detail { -// Helper class copied over from Arrow source +class FieldPosition; + +/** + * @brief Function to construct a tree of arrow schema fields + */ +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + FieldPosition field_position, + cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode const write_mode, + bool const utc_timestamps); + +// TODO: Copied over from ``writer_impl.cu``. Need to placed at a common location to avoid +// duplication. +inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode write_mode) +{ + if (col_meta.is_nullability_defined()) { + CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return col_meta.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or col->nullable(); +} + +// Helper class copied over from Arrow source. Do we need it even? 
class FieldPosition { public: - FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} FieldPosition child(int index) const { return {this, index}; } @@ -36,24 +65,271 @@ class FieldPosition { const FieldPosition* cur = this; for (int i = _depth - 1; i >= 0; --i) { path[i] = cur->_index; - cur = cur->parent_; + cur = cur->_parent; } return path; } protected: FieldPosition(const FieldPosition* parent, int index) - : parent_(parent), _index(index), _depth(parent->_depth + 1) + : _parent(parent), _index(index), _depth(parent->_depth + 1) { } - const FieldPosition* parent_; + const FieldPosition* _parent; int _index; int _depth; }; -// Functor for cudf to flatbuf::type conversion -struct dispatch_to_flatbuf_type {}; +/** + * @brief Functor to convert cudf column metadata to arrow schema + */ +struct dispatch_to_flatbuf { + FlatBufferBuilder& fbb; + cudf::detail::LinkedColPtr const& col; + column_in_metadata const& col_meta; + single_write_mode const write_mode; + bool const utc_timestamps; + FieldPosition& field_position; + Offset& field_offset; + flatbuf::Type& type_type; + std::vector& children; + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + 
type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); + } + + template + std::enable_if_t or std::is_same_v, + void> + operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t or std::is_same_v, void> + operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + } + + template + std::enable_if_t(), void> operator()() + { + if (std::is_same_v) { + type_type = flatbuf::Type_Decimal; + field_offset = + flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) + .Union(); + } else { + CUDF_FAIL("fixed point type other than decimal128 not supported for arrow schema"); + } + } + + template + std::enable_if_t(), void> operator()() + { + // TODO: Handle list and struct types. Remember, Lists are different in arrow schema and PQ + // schema pq schema. List in PQ schema: "column_name" : { "list" : { "element" }} in + // List in arrow schema: "column_name" : { "list" } + // TODO: Arrow expects only 1 child for Lists and Structs. How and Why? 
+ std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(fbb, + field_position.child(idx), + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps); + }); + + if (std::is_same_v) { + type_type = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); + } else if (std::is_same_v) { + type_type = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); + } else { + CUDF_FAIL("Unexpected nested type"); + } + } + + template + std::enable_if_t(), void> operator()() + { + CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + FieldPosition field_position, + cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + Offset field_offset = 0; + flatbuf::Type type_type = flatbuf::Type_NONE; + std::vector children; + + cudf::type_dispatcher(col->type(), + dispatch_to_flatbuf{fbb, + col, + col_meta, + write_mode, + utc_timestamps, + field_position, + field_offset, + type_type, + children}); + + auto const fb_name = fbb.CreateString(col_meta.get_name()); + auto const fb_children = fbb.CreateVector(children.data(), children.size()); + auto const is_nullable = is_col_nullable(col, col_meta, write_mode); + DictionaryOffset dictionary = 0; + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); +} /** * @brief Construct and return arrow schema from input parquet schema @@ -64,47 +340,11 @@ struct dispatch_to_flatbuf_type {}; * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. 
*/ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode const write_mode, + bool const utc_timestamps) { - // intantiate a flatbuffer builder - FlatBufferBuilder fbb; - - // Lambda function to construct a tree of arrow schema fields - std::function make_arrow_schema_fields = - [&](FieldPosition pos, int32_t const schema_idx) -> FieldOffset { - SchemaElement const schema_elem = parquet_schema[schema_idx]; - - std::vector children{}; - - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(schema_elem.num_children), - std::back_inserter(children), - [&](auto const idx) { - return make_arrow_schema_fields(pos.child(idx), schema_elem.children_idx[idx]); - }); - - auto type_type = flatbuf::Type_NONE; - Offset type_offset; - - // TODO: Implement functor - /*cudf::type_dispatcher(schema_elem.arrow_type.value_or(type_id::EMPTY), - dispatch_to_flatbuf_type{}, - schema_elem, - type_offset, - type_type, - children);*/ - - auto const fb_name = fbb.CreateString(schema_elem.name); - auto const fb_children = fbb.CreateVector(children.data(), children.size()); - auto const is_nullable = schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or - schema_elem.repetition_type == FieldRepetitionType::REPEATED; - DictionaryOffset dictionary = 0; - - // push to field offsets vector - return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); - }; - // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { std::array buffer; @@ -112,17 +352,23 @@ std::string construct_arrow_schema_ipc_message(host_span pa return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // TODO: What to do with this? 
- [[maybe_unused]] FieldPosition pos; + // intantiate a flatbuffer builder + FlatBufferBuilder fbb; + + FieldPosition field_position; std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(parquet_schema[0].num_children), + std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(linked_columns.size()), std::back_inserter(field_offsets), [&](auto const idx) { - return make_arrow_schema_fields(pos.child(idx), - parquet_schema[0].children_idx[idx]); + return make_arrow_schema_fields(fbb, + field_position.child(idx), + linked_columns[idx], + metadata.column_metadata[idx], + write_mode, + utc_timestamps); }); // Create a flatbuffer vector from the field offset vector diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index c9217b8d376..bc5ddbaa27e 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -21,14 +21,18 @@ #pragma once -#include "io/parquet/parquet.hpp" #include "io/parquet/parquet_common.hpp" #include "io/utilities/base64_utilities.hpp" #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include +#include +#include +#include +#include #include -#include +#include #include #include @@ -60,6 +64,9 @@ using FBString = flatbuffers::Offset; * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. 
*/ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema); +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode write_mode, + bool utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 607b1f8b0f5..60b1970e979 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -26,7 +26,6 @@ #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" -#include "io/utilities/base64_utilities.hpp" #include "io/utilities/column_utils.cuh" #include "io/utilities/config_utils.hpp" #include "ipc/Message_generated.h" @@ -76,7 +75,7 @@ struct aggregate_writer_metadata { host_span tbl_schema, size_type num_columns, statistics_freq stats_granularity, - bool const write_arrow_schema) + std::string const arrow_schema_ipc_message) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -90,9 +89,6 @@ struct aggregate_writer_metadata { this->column_orders = std::vector(num_columns, default_order); } - // Construct the arrow schema ipc message string. 
- auto const arrow_schema_ipc_message = construct_arrow_schema_ipc_message(schema); - for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), kv_md[p].end(), @@ -101,7 +97,7 @@ struct aggregate_writer_metadata { return KeyValue{kv.first, kv.second}; }); // Append arrow schema to the key-value metadata - if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { + if (not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); } @@ -1883,7 +1879,12 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity, write_arrow_schema); + partitions, + kv_meta, + this_table_schema, + num_columns, + stats_granularity, + construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps)); } else { agg_meta = std::make_unique(*curr_agg_meta); From 7dad37bed8763788e019d59e6dda04ea2f5ebb64 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 00:34:05 +0000 Subject: [PATCH 07/45] Handle structs and lists --- cpp/src/io/parquet/arrow_schema_writer.cpp | 103 ++++++------ cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/parquet_common.hpp | 3 + python/cudf/cudf/tests/test_parquet.py | 174 ++++++++++++++++++--- 4 files changed, 213 insertions(+), 71 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 9f55a61f630..1452bbea8f9 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -52,7 +52,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, return write_mode == single_write_mode::NO or col->nullable(); } -// Helper class copied over from Arrow source. Do we need it even? 
+// TODO: Helper class copied over from Arrow source. Do we need it even? class FieldPosition { public: FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} @@ -184,7 +184,8 @@ struct dispatch_to_flatbuf { void> operator()() { - type_type = flatbuf::Type_Timestamp; + type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -194,6 +195,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -204,6 +206,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -214,6 +217,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -252,43 +256,52 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { + // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. 
+ // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 + // if (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); } else { - CUDF_FAIL("fixed point type other than decimal128 not supported for arrow schema"); + // TODO: Should we fail or just not write arrow:schema anymore? + CUDF_FAIL("Fixed point types other than decimal128 are not supported for arrow schema"); } } template std::enable_if_t(), void> operator()() { - // TODO: Handle list and struct types. Remember, Lists are different in arrow schema and PQ - // schema pq schema. List in PQ schema: "column_name" : { "list" : { "element" }} in - // List in arrow schema: "column_name" : { "list" } - // TODO: Arrow expects only 1 child for Lists and Structs. How and Why? - std::transform(thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(col->children.size()), - std::back_inserter(children), - [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - col->children[idx], - col_meta.child(idx), + // Lists are represented differently in arrow and cuDF. + // cuDF representation: List: "col_name" : { "list" : { "element" }} (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + if constexpr (std::is_same_v) { + // Only need to process the second child (at idx = 1) + children.emplace_back(make_arrow_schema_fields(fbb, + field_position.child(0), + col->children[1], + col_meta.child(1), write_mode, - utc_timestamps); - }); - - if (std::is_same_v) { + utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); - } else if (std::is_same_v) { + } + // Traverse the struct in DFS manner and process children fields. 
+ else if constexpr (std::is_same_v) { + std::transform(thrust::make_counting_iterator(0UL), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(fbb, + field_position.child(idx), + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps); + }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); - } else { - CUDF_FAIL("Unexpected nested type"); } } @@ -352,10 +365,13 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // intantiate a flatbuffer builder + // Intantiate a flatbuffer builder FlatBufferBuilder fbb; + // Instantiate a field position mapper struct (not sure if needed yet?) FieldPosition field_position; + + // Create an empty field offset vector std::vector field_offsets; // populate field offsets (aka schema fields) @@ -371,32 +387,25 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con utc_timestamps); }); - // Create a flatbuffer vector from the field offset vector - auto const fb_offsets = fbb.CreateVector(field_offsets); - - // Create an arrow:schema flatbuffer - flatbuffers::Offset const fb_schema = - flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); - - // Schema type message has zero length body - constexpr int64_t bodylength = 0; - - // Create an ipc message flatbuffer - auto const ipc_message_flatbuffer = flatbuf::CreateMessage( - fbb, flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), bodylength); - - // All done, finish building flatbuffers - fbb.Finish(ipc_message_flatbuffer); - - // Since the ipc message doesn't have a body or other custom key value metadata, - // its size is equal to the size of its header (the schema flatbuffer) - int32_t const metadata_len = fbb.GetSize(); - - // Construct the final 
string and store in this variable here to use in base64_encode + // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to + // create an ipc message flatbuffer + fbb.Finish(flatbuf::CreateMessage( + fbb, + flatbuf::MetadataVersion_V5, /* Metadata version V5 (latest) */ + flatbuf::MessageHeader_Schema, /* Schema type message header */ + flatbuf::CreateSchema( + fbb, flatbuf::Endianness::Endianness_Little, fbb.CreateVector(field_offsets)) + .Union(), /* Build an arrow:schema from the field vector */ + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH /* Body length is zero for schema type ipc message */ + )); + + // Construct the final string and store it here to use its view in base64_encode std::string const ipc_message = convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + - convert_int32_to_byte_string(metadata_len) + - std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the + // ipc message's metadata length + convert_int32_to_byte_string(fbb.GetSize()) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), fbb.GetSize()); // Encode the final ipc message string to base64 and return return cudf::io::detail::base64_encode(ipc_message); diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index bc5ddbaa27e..29db9f05df4 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -66,7 +66,7 @@ using FBString = flatbuffers::Offset; */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, - single_write_mode write_mode, - bool utc_timestamps); + single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 
be469dd25c5..69c0a89fd57 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -31,6 +31,9 @@ constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(in constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); constexpr int32_t IPC_CONTINUATION_TOKEN = -1; +// Schema type ipc message has zero length body +constexpr int64_t SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e32fdacd8d6..0776a3a6ada 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3372,31 +3372,28 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Ensure that the structs with duration types are faithfully being # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame( - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], + data = { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), } - } + ], } - ) + } + + pdf = pd.DataFrame({"struct": pd.Series(data)}) - # Reset the buffer and write parquet with arrow buffer = BytesIO() pdf.to_parquet(buffer, engine="pyarrow") @@ -3407,3 +3404,136 @@ def 
test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + + +def test_parquet_writer_roundtrip_with_arrow_schema(): + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "ms": cudf.Series([1234, None, 32442], dtype="timedelta64[ms]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "ns": cudf.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int64_list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + buffer = BytesIO() + expected.to_parquet(buffer) + read = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + + assert_eq(expected, read) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, 
"c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # IO buffer + buffer = BytesIO() + + # Write expected data frame to Parquet + expected.to_parquet(buffer) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + + From e1fc02ef62dcdd054c5fefb08e112304b2b2ec5f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 01:16:25 +0000 Subject: [PATCH 08/45] Remove unused code borrowed from arrow. 
--- cpp/src/io/parquet/arrow_schema_writer.cpp | 94 +++++----------------- cpp/src/io/parquet/writer_impl.cu | 50 ------------ python/cudf/cudf/tests/test_parquet.py | 2 - 3 files changed, 21 insertions(+), 125 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 1452bbea8f9..3bc728abe10 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -23,13 +23,10 @@ namespace cudf::io::parquet::detail { -class FieldPosition; - /** * @brief Function to construct a tree of arrow schema fields */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - FieldPosition field_position, cudf::detail::LinkedColPtr const& col, column_in_metadata const& col_meta, single_write_mode const write_mode, @@ -52,35 +49,6 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, return write_mode == single_write_mode::NO or col->nullable(); } -// TODO: Helper class copied over from Arrow source. Do we need it even? 
-class FieldPosition { - public: - FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} - - FieldPosition child(int index) const { return {this, index}; } - - std::vector path() const - { - std::vector path(_depth); - const FieldPosition* cur = this; - for (int i = _depth - 1; i >= 0; --i) { - path[i] = cur->_index; - cur = cur->_parent; - } - return path; - } - - protected: - FieldPosition(const FieldPosition* parent, int index) - : _parent(parent), _index(index), _depth(parent->_depth + 1) - { - } - - const FieldPosition* _parent; - int _index; - int _depth; -}; - /** * @brief Functor to convert cudf column metadata to arrow schema */ @@ -90,7 +58,6 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; - FieldPosition& field_position; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -274,16 +241,12 @@ struct dispatch_to_flatbuf { std::enable_if_t(), void> operator()() { // Lists are represented differently in arrow and cuDF. 
- // cuDF representation: List: "col_name" : { "list" : { "element" }} (2 children) - // arrow schema representation: List: "col_name" : { "list" } (1 child) + // cuDF representation: List: "col_name" : { "list","element : int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) if constexpr (std::is_same_v) { // Only need to process the second child (at idx = 1) - children.emplace_back(make_arrow_schema_fields(fbb, - field_position.child(0), - col->children[1], - col_meta.child(1), - write_mode, - utc_timestamps)); + children.emplace_back(make_arrow_schema_fields( + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -293,12 +256,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - col->children[idx], - col_meta.child(idx), - write_mode, - utc_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -308,12 +267,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { + // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and + // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. 
CUDF_FAIL("Dictionary columns are not supported for writing"); } }; FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - FieldPosition field_position, cudf::detail::LinkedColPtr const& col, column_in_metadata const& col_meta, single_write_mode const write_mode, @@ -323,16 +283,10 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(col->type(), - dispatch_to_flatbuf{fbb, - col, - col_meta, - write_mode, - utc_timestamps, - field_position, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + col->type(), + dispatch_to_flatbuf{ + fbb, col, col_meta, write_mode, utc_timestamps, field_offset, type_type, children}); auto const fb_name = fbb.CreateString(col_meta.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -365,27 +319,21 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // Intantiate a flatbuffer builder + // Instantiate a flatbuffer builder FlatBufferBuilder fbb; - // Instantiate a field position mapper struct (not sure if needed yet?) 
- FieldPosition field_position; - // Create an empty field offset vector std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform(thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(linked_columns.size()), - std::back_inserter(field_offsets), - [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - linked_columns[idx], - metadata.column_metadata[idx], - write_mode, - utc_timestamps); - }); + std::transform( + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(linked_columns.size()), + std::back_inserter(field_offsets), + [&](auto const idx) { + return make_arrow_schema_fields( + fbb, linked_columns[idx], metadata.column_metadata[idx], write_mode, utc_timestamps); + }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to // create an ipc message flatbuffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 60b1970e979..81fb42bee15 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -516,56 +516,6 @@ struct leaf_schema_fn { } } - /* TODO: This code block should be ``time`` type and not ``duration`` type - // unsupported outside cudf for parquet 1.0. 
- template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; - } - - // unsupported outside cudf for parquet 1.0. 
- template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; - } - */ - template std::enable_if_t, void> operator()() { diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 0776a3a6ada..b183c5ef6e5 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3535,5 +3535,3 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Check results assert_eq(expected, got) assert_eq(expected, got2) - - From 44fb0ef7d6e1ef3098ad6cce527a89c966894511 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 02:37:09 +0000 Subject: [PATCH 09/45] minor improvements to tests and code --- cpp/src/io/parquet/arrow_schema_writer.cpp | 24 +++++++++---------- cpp/src/io/parquet/writer_impl.cu | 4 +--- python/cudf/cudf/tests/test_parquet.py | 27 ++++++++++++++-------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 3bc728abe10..90f32a2b3cb 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -73,56 +73,56 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, true).Union(); + field_offset = 
flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template @@ -226,14 +226,14 @@ struct dispatch_to_flatbuf { // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 // - if (std::is_same_v) { + if constexpr (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); } else { // TODO: Should we fail or just not write arrow:schema anymore? 
- CUDF_FAIL("Fixed point types other than decimal128 are not supported for arrow schema"); + CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } } @@ -269,7 +269,7 @@ struct dispatch_to_flatbuf { { // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. - CUDF_FAIL("Dictionary columns are not supported for writing"); + CUDF_FAIL("Dictionary columns are not supported for writing arrow schema"); } }; @@ -327,7 +327,7 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // populate field offsets (aka schema fields) std::transform( - thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(0UL), thrust::make_counting_iterator(linked_columns.size()), std::back_inserter(field_offsets), [&](auto const idx) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fb42bee15..0c7798b9b18 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -28,8 +28,6 @@ #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "io/utilities/config_utils.hpp" -#include "ipc/Message_generated.h" -#include "ipc/Schema_generated.h" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" @@ -99,7 +97,7 @@ struct aggregate_writer_metadata { // Append arrow schema to the key-value metadata if (not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( - KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); + KeyValue{"ARROW:schema", arrow_schema_ipc_message}); } } } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b183c5ef6e5..62d40cff67c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3407,12 +3407,12 @@ def 
test_parquet_reader_roundtrip_structs_with_arrow_schema(): def test_parquet_writer_roundtrip_with_arrow_schema(): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema expected = cudf.DataFrame( { "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), - "ms": cudf.Series([1234, None, 32442], dtype="timedelta64[ms]"), "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), - "ns": cudf.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), "duration_list": list( [ [ @@ -3429,8 +3429,13 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): ], ] ), - "int64": cudf.Series([1234, 123, 4123], dtype="int64"), - "int64_list": list([[1, 2], [1, 2], [1, 2]]), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed_pt": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "map": cudf.Series(["cat", "dog", "lion"]).map( {"cat": "kitten", "dog": "puppy", "lion": "cub"} @@ -3438,11 +3443,17 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) + # Write to Parquet buffer = BytesIO() expected.to_parquet(buffer) - read = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - assert_eq(expected, read) + # Read parquet with pyarrow and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) @pytest.mark.parametrize( @@ -3516,10 +3527,8 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): expected = cudf.DataFrame.from_arrow(pa_expected) - # IO buffer - buffer = BytesIO() - # Write expected data frame to Parquet + buffer = BytesIO() expected.to_parquet(buffer) # Read Parquet with pyarrow From 
e733ff150cf9e3d3af4c37598b3f7a955037eac8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:21:59 +0000 Subject: [PATCH 10/45] Code cleanup and add API docs. --- cpp/src/io/parquet/arrow_schema_writer.cpp | 104 ++++++++++----------- cpp/src/io/parquet/arrow_schema_writer.hpp | 53 ++++++----- cpp/src/io/parquet/writer_impl.cu | 15 --- 3 files changed, 84 insertions(+), 88 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 90f32a2b3cb..25d3e01abc4 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -15,40 +15,46 @@ */ /** - * @file arrow_schema.cpp + * @file arrow_schema_writer.cpp * @brief Arrow IPC schema writer implementation */ #include "arrow_schema_writer.hpp" +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" + +#include +#include +#include + namespace cudf::io::parquet::detail { +// Copied over from arrow source for better code readability +namespace flatbuf = cudf::io::parquet::flatbuf; +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + /** * @brief Function to construct a tree of arrow schema fields + * + * @param fbb + * @param column + * @param column_metadata + * @param write_mode + * @param utc_timestamps */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, single_write_mode const write_mode, bool const utc_timestamps); -// TODO: Copied over from ``writer_impl.cu``. Need to placed at a common location to avoid -// duplication. 
-inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Functor to convert cudf column metadata to arrow schema */ @@ -152,7 +158,7 @@ struct dispatch_to_flatbuf { operator()() { type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -223,16 +229,17 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. - // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 - // if constexpr (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); - } else { - // TODO: Should we fail or just not write arrow:schema anymore? + } + // cuDF-PQ writer supports ``decimal32`` and ``decimal64`` types, not directly supported by + // Arrow without explicit conversion. See more: + // https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155. + else { + // TODO: Should we fail here or just not write arrow schema?. 
CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } } @@ -241,15 +248,16 @@ struct dispatch_to_flatbuf { std::enable_if_t(), void> operator()() { // Lists are represented differently in arrow and cuDF. - // cuDF representation: List: "col_name" : { "list","element : int" } (2 children) - // arrow schema representation: List: "col_name" : { "list" } (1 child) + // cuDF representation: List: "col_name" : { "list", "element:int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { - // Only need to process the second child (at idx = 1) children.emplace_back(make_arrow_schema_fields( fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } + // Traverse the struct in DFS manner and process children fields. else if constexpr (std::is_same_v) { std::transform(thrust::make_counting_iterator(0UL), @@ -274,8 +282,8 @@ struct dispatch_to_flatbuf { }; FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, single_write_mode const write_mode, bool const utc_timestamps) { @@ -284,13 +292,13 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::vector children; cudf::type_dispatcher( - col->type(), + column->type(), dispatch_to_flatbuf{ - fbb, col, col_meta, write_mode, utc_timestamps, field_offset, type_type, children}); + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - auto const fb_name = fbb.CreateString(col_meta.get_name()); + auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); - 
auto const is_nullable = is_col_nullable(col, col_meta, write_mode); + auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); DictionaryOffset dictionary = 0; // push to field offsets vector @@ -298,15 +306,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); } -/** - * @brief Construct and return arrow schema from input parquet schema - * - * Recursively traverses through parquet schema to construct the arrow schema tree. - * Serializes the arrow schema tree and stores it as the header (or metadata) of - * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended - * with header size (padded for 16 byte alignment) and a continuation string. The final - * string is base64 encoded and returned. - */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, @@ -326,14 +325,15 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform( - thrust::make_counting_iterator(0UL), - thrust::make_counting_iterator(linked_columns.size()), - std::back_inserter(field_offsets), - [&](auto const idx) { - return make_arrow_schema_fields( - fbb, linked_columns[idx], metadata.column_metadata[idx], write_mode, utc_timestamps); - }); + std::transform(thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), + std::back_inserter(field_offsets), + [&](auto const& elem) { + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + }); // Build an arrow:schema flatbuffer using the field offset vector and use it as 
the header to // create an ipc message flatbuffer diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 29db9f05df4..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -15,45 +15,49 @@ */ /** - * @file arrow_schema.hpp + * @file arrow_schema_writer.hpp * @brief Arrow IPC schema writer implementation */ #pragma once -#include "io/parquet/parquet_common.hpp" -#include "io/utilities/base64_utilities.hpp" -#include "ipc/Message_generated.h" -#include "ipc/Schema_generated.h" - -#include #include #include #include #include #include -#include -#include - -#include -#include -#include #include -#include #include namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -namespace flatbuf = cudf::io::parquet::flatbuf; - -using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; -using DictionaryOffset = flatbuffers::Offset; -using FieldOffset = flatbuffers::Offset; -using Offset = flatbuffers::Offset; -using FBString = flatbuffers::Offset; +/** + * @brief Returns ``true`` if the column is nullable or if the write mode is not + * set to write the table all at once instead of chunked + * + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] inline bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. 
" + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} /** * @brief Construct and return arrow schema from input parquet schema @@ -63,6 +67,13 @@ using FBString = flatbuffers::Offset; * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. + * + * @param linked_columns Vector of table column views + * @param metadata Metadata of the columns of the table + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0c7798b9b18..63745939755 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -599,21 +599,6 @@ struct leaf_schema_fn { } }; -inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. 
- return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Construct schema from input columns and per-column input options * From f4a9595d393a8362f857e778876ba3b542fcd9e5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:22:52 +0000 Subject: [PATCH 11/45] Revert changes to types.hpp --- cpp/include/cudf/types.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 101791cee0b..466d53fcafc 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,10 +216,6 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - TIME_SECONDS, ///< time of day since midnight in seconds in int64 - TIME_MILLISECONDS, ///< time of day since midnight in milliseconds in int64 - TIME_MICROSECONDS, ///< time of day since midnight in microseconds in int64 - TIME_NANOSECONDS, ///< time of day since midnight in nanoseconds in int64 DURATION_DAYS, ///< time interval of days in int64 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 From ede6191e2a6483c0309439262a514b9f53908c51 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:31:51 +0000 Subject: [PATCH 12/45] Minor code and doc cleanup --- cpp/src/io/parquet/arrow_schema_writer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 25d3e01abc4..1c3f66fc818 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -41,13 +41,15 @@ using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; /** - * @brief 
Function to construct a tree of arrow schema fields + * @brief Recursively construct the arrow schema (fields) tree * - * @param fbb - * @param column - * @param column_metadata - * @param write_mode - * @param utc_timestamps + * @param fbb The root flatbuffer builder object instance + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return Flatbuffer offset to the constructed field */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, @@ -56,7 +58,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, bool const utc_timestamps); /** - * @brief Functor to convert cudf column metadata to arrow schema + * @brief Functor to convert cudf column metadata to arrow schema field metadata */ struct dispatch_to_flatbuf { FlatBufferBuilder& fbb; From 62a26843be733c53fa4d4a378bd5676013984be0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:40:30 +0000 Subject: [PATCH 13/45] Minor fix for failing pytest --- python/cudf/cudf/tests/test_parquet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 62d40cff67c..ae5846f9bcc 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3170,10 +3170,10 @@ def test_parquet_writer_time_delta_physical_type(): got = pd.read_parquet(buffer) expected = pd.DataFrame( { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], }, dtype="str", ) From 6e448abefffa5d82c9b9f72f7f98abecebfd2540 Mon Sep 17 00:00:00 2001 From: Muhammad 
Haseeb Date: Wed, 29 May 2024 04:46:29 +0000 Subject: [PATCH 14/45] Handle int96 timestamps. --- cpp/src/io/parquet/arrow_schema_writer.cpp | 71 +++++++++++++++++----- cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 4 +- python/cudf/cudf/tests/test_parquet.py | 8 ++- 4 files changed, 68 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 1c3f66fc818..b13bcc98581 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -48,6 +48,7 @@ using FBString = flatbuffers::Offset; * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC + * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return Flatbuffer offset to the constructed field */ @@ -55,7 +56,8 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps); + bool const utc_timestamps, + bool const int96_timestamps); /** * @brief Functor to convert cudf column metadata to arrow schema field metadata @@ -66,6 +68,7 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; + bool const int96_timestamps; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -159,8 +162,13 @@ struct dispatch_to_flatbuf { void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", 
"Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -169,8 +177,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -180,8 +193,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -191,8 +209,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) @@ -255,7 +278,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps, int96_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -266,8 +289,12 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields( - fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps, + int96_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -287,16 +314,23 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps) + bool const utc_timestamps, + bool const int96_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + int96_timestamps, + field_offset, + type_type, + children}); auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -311,7 +345,8 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string 
construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps) + bool const utc_timestamps, + bool const int96_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { @@ -333,8 +368,12 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields( - fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + thrust::get<0>(elem), + thrust::get<1>(elem), + write_mode, + utc_timestamps, + int96_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..8ba48361eb3 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -72,12 +72,14 @@ using namespace cudf::io::detail; * @param metadata Metadata of the columns of the table * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC + * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps); + bool const utc_timestamps, + bool const int96_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 
63745939755..d08fed7761c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1817,7 +1817,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps)); + (write_arrow_schema) ? construct_arrow_schema_ipc_message( + vec, table_meta, write_mode, utc_timestamps, int96_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ae5846f9bcc..220cef67bd8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1620,7 +1620,13 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True) + # INT96 timestamps have been deprecated in Arrow so set `store_schema=False` + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + store_schema=False, + ) assert os.path.exists(gdf_fname) From 3c800f5e65ff261acc8966d56e53a99357c366f6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 19:15:44 +0000 Subject: [PATCH 15/45] Add `stats_dtype` to INT64 duration columns --- cpp/src/io/parquet/writer_impl.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d08fed7761c..29ac5125a85 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -517,37 +517,37 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; - col_schema.ts_scale = 24 * 60 * 60; + col_schema.type = Type::INT64; + 
col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.ts_scale = 24 * 60 * 60; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_MILLISECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_MICROSECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_NANOSECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template From f7aaaad7476aad0def7d903cbca9e84968219db5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 01:42:15 +0000 Subject: [PATCH 16/45] turn arrow schema off by default --- cpp/include/cudf/io/parquet.hpp | 6 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 67 +++---------- cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 110 ++++++++++----------- cpp/src/io/parquet/writer_impl.hpp | 1 - python/cudf/cudf/_lib/parquet.pyx | 6 +- python/cudf/cudf/tests/test_parquet.py | 40 +------- 7 files changed, 81 insertions(+), 153 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ac04cd4c11f..ede1994312d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -72,7 +72,7 @@ class parquet_reader_options { // Whether to use PANDAS metadata to load columns bool _use_pandas_metadata = true; // Whether to read and use 
ARROW schema - bool _use_arrow_schema = true; + bool _use_arrow_schema = false; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; @@ -603,7 +603,7 @@ class parquet_writer_options { // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; // Whether to write ARROW schema - bool _write_arrow_schema = true; + bool _write_arrow_schema = false; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -1327,7 +1327,7 @@ class chunked_parquet_writer_options { // Parquet writer can write timestamps as UTC. Defaults to true. bool _write_timestamps_as_UTC = true; // Whether to write ARROW schema - bool _write_arrow_schema = true; + bool _write_arrow_schema = false; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index b13bcc98581..f3f3fba4bac 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -48,7 +48,6 @@ using FBString = flatbuffers::Offset; * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC - * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return Flatbuffer offset to the constructed field */ @@ -56,8 +55,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps); + bool const 
utc_timestamps); /** * @brief Functor to convert cudf column metadata to arrow schema field metadata @@ -68,7 +66,6 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; - bool const int96_timestamps; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -162,11 +159,6 @@ struct dispatch_to_flatbuf { void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -177,11 +169,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -193,11 +180,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -209,11 +191,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use 
one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -278,7 +255,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps, int96_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -289,12 +266,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - col->children[idx], - col_meta.child(idx), - write_mode, - utc_timestamps, - int96_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -314,23 +287,20 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps) + bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(column->type(), - dispatch_to_flatbuf{fbb, - column, - column_metadata, - write_mode, - utc_timestamps, - int96_timestamps, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + column->type(), + dispatch_to_flatbuf{ + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + + std::cout << "Name: " << column_metadata.get_name() + << ", Type: " << static_cast::type>(column->type().id()) + << std::endl; auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children 
= fbb.CreateVector(children.data(), children.size()); @@ -345,8 +315,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps) + bool const utc_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { @@ -368,12 +337,8 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields(fbb, - thrust::get<0>(elem), - thrust::get<1>(elem), - write_mode, - utc_timestamps, - int96_timestamps); + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 8ba48361eb3..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -72,14 +72,12 @@ using namespace cudf::io::detail; * @param metadata Metadata of the columns of the table * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC - * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps); + bool const 
utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 29ac5125a85..1c2b26f1f8f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -345,8 +345,8 @@ struct leaf_schema_fn { schema_tree_node& col_schema; cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; - bool timestamp_is_int96; bool timestamp_is_utc; + bool write_arrow_schema; template std::enable_if_t, void> operator()() @@ -468,72 +468,75 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.ts_scale = 1000; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = 1000; + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.type = Type::INT64; col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (timestamp_is_int96) { - col_schema.ts_scale = -1000; // negative value indicates division by absolute value - } - // set logical type if it's not int96 - else { - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; - } + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = 24 * 60 * 60; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60; + + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale *= 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template @@ -541,6 +544,9 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template @@ -548,6 +554,9 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template @@ -609,8 +618,8 @@ std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, - bool int96_timestamps, - bool utc_timestamps) + bool utc_timestamps, + bool write_arrow_schema) { std::vector schema; schema_tree_node root{}; @@ -876,11 +885,9 @@ std::vector construct_parquet_schema_tree( schema_tree_node col_schema{}; - bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); - cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); + leaf_schema_fn{col_schema, col, col_meta, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? 
OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1133,19 +1140,17 @@ void calculate_page_fragments(device_span frag, * * @param frag_stats output statistics * @param frags Input page fragments - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, device_span frags, - bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( - frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); + frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream); stream.synchronize(); } @@ -1655,7 +1660,6 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param dict_policy Policy for dictionary use * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any @@ -1680,7 +1684,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, dictionary_policy dict_policy, size_t max_dictionary_size, single_write_mode write_mode, - bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, bool write_arrow_schema, @@ -1689,7 +1692,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, { auto vec = table_to_linked_columns(input); auto schema_tree = - construct_parquet_schema_tree(vec, table_meta, write_mode, 
int96_timestamps, utc_timestamps); + construct_parquet_schema_tree(vec, table_meta, write_mode, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. std::vector parquet_columns; @@ -1817,9 +1820,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) ? construct_arrow_schema_ipc_message( - vec, table_meta, write_mode, utc_timestamps, int96_timestamps) - : ""); + (write_arrow_schema) + ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -1990,10 +1993,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and gather fragment statistics if (not frag_stats.is_empty()) { - gather_fragment_statistics(frag_stats, - {page_fragments.device_ptr(), static_cast(total_frags)}, - int96_timestamps, - stream); + gather_fragment_statistics( + frag_stats, {page_fragments.device_ptr(), static_cast(total_frags)}, stream); } } @@ -2297,7 +2298,6 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2328,7 +2328,6 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), 
_write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2407,7 +2406,6 @@ void writer::impl::write(table_view const& input, std::vector co _dict_policy, _max_dictionary_size, _single_write_mode, - _int96_timestamps, _utc_timestamps, _write_v2_headers, _write_arrow_schema, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..bcc8de13ceb 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,7 +153,6 @@ class writer::impl { dictionary_policy const _dict_policy; size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; - bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; bool const _write_arrow_schema; diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c11a6d026b9..886133e6763 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -412,7 +412,7 @@ def write_parquet( object column_encoding=None, object column_type_length=None, object output_as_binary=None, - write_arrow_schema=True, + write_arrow_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -597,7 +597,7 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. - write_arrow_schema : bool, default True + write_arrow_schema : bool, default False If ``True``, enable computing and writing arrow schema to Parquet file footer's key-value metadata section. 
See Also @@ -628,7 +628,7 @@ cdef class ParquetWriter: int max_page_size_rows=20000, int max_dictionary_size=1048576, bool use_dictionary=True, - bool store_schema=True): + bool store_schema=False): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 220cef67bd8..a44e9612d8f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,38 +1604,6 @@ def clone_field(table, name, datatype): assert_eq(expect, got) -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): - gdf_fname = tmpdir.join("gdf.parquet") - - if len(pdf) == 0: - pdf = pdf.reset_index(drop=True) - gdf = gdf.reset_index(drop=True) - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - if "col_category" in gdf.columns: - gdf = gdf.drop(columns=["col_category"]) - - assert_eq(pdf, gdf) - - # Write out the gdf using the GPU accelerated writer with INT96 timestamps - # INT96 timestamps have been deprecated in Arrow so set `store_schema=False` - gdf.to_parquet( - gdf_fname.strpath, - index=None, - int96_timestamps=True, - store_schema=False, - ) - - assert os.path.exists(gdf_fname) - - expect = pdf - got = pd.read_parquet(gdf_fname) - - # verify INT96 timestamps were converted back to the same data. 
- assert_eq(expect, got, check_categorical=False, check_dtype=False) - def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") @@ -3171,7 +3139,7 @@ def test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=True) got = pd.read_parquet(buffer) expected = pd.DataFrame( @@ -3209,7 +3177,7 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=True) # TODO: Remove `check_dtype` once following issue is fixed in arrow: # https://github.com/apache/arrow/issues/33321 assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) @@ -3451,7 +3419,7 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): # Write to Parquet buffer = BytesIO() - expected.to_parquet(buffer) + expected.to_parquet(buffer, store_schema=True) # Read parquet with pyarrow and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) @@ -3535,7 +3503,7 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Write expected data frame to Parquet buffer = BytesIO() - expected.to_parquet(buffer) + expected.to_parquet(buffer, store_schema=True) # Read Parquet with pyarrow pa_got = pq.read_table(buffer) From 04a19985be93f3f9201ad3716fb29ce4459fdbbf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 02:12:42 +0000 Subject: [PATCH 17/45] Disable the missed `store_schema` in parquet.py --- python/cudf/cudf/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 25647d16271..ef5bd50053f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -962,7 +962,7 @@ def to_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, - store_schema=True, + store_schema=False, *args, **kwargs, ): From 
ff22e7ddd173cd4f74d5f7540eec1a71271a77d8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 05:44:23 +0000 Subject: [PATCH 18/45] minor bug fixing --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 4 -- cpp/src/io/parquet/writer_impl.cu | 12 +++-- cpp/tests/io/parquet_writer_test.cpp | 60 ++++++++++++++-------- python/cudf/cudf/tests/test_parquet.py | 55 +++++++++++++------- 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 8be17db3781..90d956e578d 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -67,7 +67,7 @@ void write_parquet(cudf::table_view input, table_metadata.column_metadata.end(), [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata); + builder.metadata(table_metadata).write_arrow_schema(true); auto options = builder.build(); options.set_compression(compression); // Either use the input stats level or don't write stats diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index f3f3fba4bac..5f7fd9e9409 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -298,10 +298,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, dispatch_to_flatbuf{ fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - std::cout << "Name: " << column_metadata.get_name() - << ", Type: " << static_cast::type>(column->type().id()) - << std::endl; - auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1c2b26f1f8f..5d1001633f8 
100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -94,11 +94,13 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); - // Append arrow schema to the key-value metadata - if (not arrow_schema_ipc_message.empty()) { - this->files[p].key_value_metadata.emplace_back( - KeyValue{"ARROW:schema", arrow_schema_ipc_message}); - } + } + + // Append arrow schema to the key-value metadata + if (not arrow_schema_ipc_message.empty()) { + std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { + file.key_value_metadata.emplace_back(KeyValue{"ARROW:schema", arrow_schema_ipc_message}); + }); } } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index ad0860e265e..509c89480e3 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -35,7 +35,7 @@ using cudf::test::iterators::no_nulls; template -void test_durations(mask_op_t mask_op, bool use_byte_stream_split) +void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); @@ -73,23 +73,33 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT); } } - - auto filepath = temp_env->get_temp_filepath("Durations.parquet"); + std::string a = (arrow_schema) ? 
"1" : "0"; + auto filepath = "/home/coder/Durations" + a + ".parquet"; cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_arrow_schema(arrow_schema); + cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + if (arrow_schema) { + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); @@ -98,10 +108,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) TEST_F(ParquetWriterTest, Durations) { - 
test_durations([](auto i) { return true; }, false); - test_durations([](auto i) { return (i % 2) != 0; }, false); - test_durations([](auto i) { return (i % 3) != 0; }, false); - test_durations([](auto i) { return false; }, false); + test_durations([](auto i) { return true; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); + + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -1866,10 +1881,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit) TEST_F(ParquetWriterTest, DurationByteStreamSplit) { - test_durations([](auto i) { return true; }, true); - test_durations([](auto i) { return (i % 2) != 0; }, true); - test_durations([](auto i) { return (i % 3) != 0; }, true); - test_durations([](auto i) { return false; }, true); + test_durations([](auto i) { return true; }, true, false); + test_durations([](auto i) { return (i % 2) != 0; }, true, false); + test_durations([](auto i) { return (i % 3) != 0; }, true, false); + test_durations([](auto i) { return false; }, true, false); + + test_durations([](auto i) { return true; }, true, true); + test_durations([](auto i) { return (i % 2) != 0; }, true, true); + test_durations([](auto i) { return (i % 3) != 0; }, true, true); + test_durations([](auto i) { return false; }, true, true); } TEST_F(ParquetWriterTest, WriteFixedLenByteArray) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a44e9612d8f..460d917e95e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2527,6 +2527,9 @@ def normalized_equals(value1, value2): value1 
= value1.replace(tzinfo=None) if isinstance(value2, datetime.datetime): value2 = value2.replace(tzinfo=None) + if isinstance(value1, pd.Timedelta): + unit = "ms" if value1.unit == "s" else value1.unit + value2 = pd.Timedelta(value2, unit=unit) # if one is datetime then both values are datetimes now if isinstance(value1, datetime.datetime): @@ -2540,7 +2543,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2557,7 +2561,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): if add_nulls: for col in gdf: set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False) + gdf.to_parquet(file_path, index=False, store_schema=store_schema) # Read back from pyarrow pq_file = pq.ParquetFile(file_path) @@ -3126,8 +3130,8 @@ def test_parquet_writer_zstd(): got = pd.read_parquet(buff) assert_eq(expected, got) - -def test_parquet_writer_time_delta_physical_type(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( { "s": cudf.Series([1], dtype="timedelta64[s]"), @@ -3139,22 +3143,35 @@ def test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer, store_schema=True) + df.to_parquet(buffer, store_schema=store_schema) got = pd.read_parquet(buffer) - expected = pd.DataFrame( - { - "s": ["0 days 00:00:01"], - "ms": ["0 days 00:00:00.002000"], - "us": ["0 days 00:00:00.000003"], - "ns": ["0 days 00:00:00.000004"], - }, - dtype="str", - ) + + if (store_schema): + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 
00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s": ["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) assert_eq(got.astype("str"), expected) -def test_parquet_roundtrip_time_delta(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): num_rows = 12345 df = cudf.DataFrame( { @@ -3177,11 +3194,11 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer, store_schema=True) - # TODO: Remove `check_dtype` once following issue is fixed in arrow: - # https://github.com/apache/arrow/issues/33321 + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - + if (store_schema == True): + assert_eq(df, pd.read_parquet(buffer)) def test_parquet_reader_malformed_file(datadir): fname = datadir / "nested-unsigned-malformed.parquet" From d5f01beda3f41456703c048623238ba0084a166a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 07:46:25 +0000 Subject: [PATCH 19/45] Fixes for tests --- cpp/src/io/parquet/arrow_schema_writer.cpp | 22 +++++++++++++++++----- cpp/src/io/parquet/parquet.hpp | 11 +++++++++++ cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 15 +++++---------- cpp/tests/io/parquet_writer_test.cpp | 11 ++++------- python/cudf/cudf/tests/test_parquet.py | 9 +++++---- 6 files changed, 43 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 5f7fd9e9409..7d5419e2ebd 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -155,9 +155,15 @@ struct dispatch_to_flatbuf { } template - std::enable_if_t or 
std::is_same_v, - void> - operator()() + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Date; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); + } + + template + std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp @@ -200,8 +206,14 @@ struct dispatch_to_flatbuf { } template - std::enable_if_t or std::is_same_v, void> - operator()() + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Duration; field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index e35742c2527..b0734719002 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -57,6 +57,15 @@ struct TimeUnit { Type type; }; +struct DateUnit { + enum Type : char { DAYS = 0, MILLIS = 1 }; + Type type; +}; + +struct DateType { + DateUnit unit = {DateUnit::DAYS}; +}; + struct TimeType { // Default to true because the timestamps are implicitly in UTC // Writer option overrides this default @@ -97,12 +106,14 @@ struct LogicalType { thrust::optional decimal_type; thrust::optional time_type; thrust::optional timestamp_type; + thrust::optional date_type; thrust::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} + LogicalType(DateType&& date) : type(DATE), date_type(date) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} constexpr 
bool is_time_millis() const diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 565dc2e02f2..9c7a1348aec 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -45,7 +45,7 @@ thrust::optional converted_to_logical_type(SchemaElement const& sch case MAP: return LogicalType{LogicalType::MAP}; case LIST: return LogicalType{LogicalType::LIST}; case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; - case DATE: return LogicalType{LogicalType::DATE}; + case DATE: return LogicalType{DateType{DateUnit::DAYS}}; case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5d1001633f8..0bfbad8d260 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -464,7 +464,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{LogicalType::DATE}; + col_schema.logical_type = LogicalType{DateType{DateUnit::DAYS}}; } template @@ -507,15 +507,10 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60; - - if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - col_schema.ts_scale *= 1000; - } + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 509c89480e3..58dfc49d4aa 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -86,16 +86,13 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + if (arrow_schema) { - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); } else { - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - auto durations_s_got = cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 460d917e95e..41f47899d84 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,7 +1604,6 @@ def 
clone_field(table, name, datatype): assert_eq(expect, got) - def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=20, dtype="float64") @@ -3130,6 +3129,7 @@ def test_parquet_writer_zstd(): got = pd.read_parquet(buff) assert_eq(expected, got) + @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( @@ -3147,7 +3147,7 @@ def test_parquet_writer_time_delta_physical_type(store_schema): got = pd.read_parquet(buffer) - if (store_schema): + if store_schema: expected = pd.DataFrame( { "s": ["0 days 00:00:01"], @@ -3197,8 +3197,9 @@ def test_parquet_roundtrip_time_delta(store_schema): df.to_parquet(buffer, store_schema=store_schema) # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - if (store_schema == True): - assert_eq(df, pd.read_parquet(buffer)) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) + def test_parquet_reader_malformed_file(datadir): fname = datadir / "nested-unsigned-malformed.parquet" From 55296dfbe903587f9f101e1f6d002c158d014510 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 19:37:27 +0000 Subject: [PATCH 20/45] Cleanup and restore int96timestamps for this PR. 
--- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/include/cudf/types.hpp | 2 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 48 +++++++++++--- cpp/src/io/parquet/arrow_schema_writer.hpp | 1 + cpp/src/io/parquet/parquet_common.hpp | 10 +-- cpp/src/io/parquet/reader_impl_helpers.cpp | 10 ++- cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 77 ++++++++++++++-------- cpp/src/io/parquet/writer_impl.hpp | 1 + cpp/tests/io/parquet_writer_test.cpp | 4 +- python/cudf/cudf/tests/test_parquet.py | 48 ++++++++++++-- 11 files changed, 150 insertions(+), 55 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 90d956e578d..8be17db3781 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -67,7 +67,7 @@ void write_parquet(cudf::table_view input, table_metadata.column_metadata.end(), [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata).write_arrow_schema(true); + builder.metadata(table_metadata); auto options = builder.build(); options.set_compression(compression); // Either use the input stats level or don't write stats diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 466d53fcafc..baf07fa3db6 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,7 +216,7 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - DURATION_DAYS, ///< time interval of days in int64 + DURATION_DAYS, ///< time interval of days in int32 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 DURATION_MICROSECONDS, ///< time interval of microseconds in int64 diff --git 
a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 7d5419e2ebd..6e56c2f1e35 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -55,6 +55,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps); /** @@ -65,6 +66,7 @@ struct dispatch_to_flatbuf { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; single_write_mode const write_mode; + bool const int96_timestamps; bool const utc_timestamps; Offset& field_offset; flatbuf::Type& type_type; @@ -165,6 +167,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -175,6 +180,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -186,6 +194,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -197,6 +208,9 @@ struct dispatch_to_flatbuf { 
template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -267,7 +281,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, int96_timestamps, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -278,8 +292,12 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields( - fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + col->children[idx], + col_meta.child(idx), + write_mode, + int96_timestamps, + utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -299,16 +317,23 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + int96_timestamps, + utc_timestamps, + field_offset, + type_type, + children}); auto const 
fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -323,6 +348,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes @@ -345,8 +371,12 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields( - fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + thrust::get<0>(elem), + thrust::get<1>(elem), + write_mode, + int96_timestamps, + utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..15e4f63a9f9 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -78,6 +78,7 @@ using namespace cudf::io::detail; std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 69c0a89fd57..e42c259b1bf 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cudf::io::parquet::detail { @@ -27,12 +28,13 @@ auto 
constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) // Constants copied from arrow source and renamed to match the case -constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); -constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); -constexpr int32_t IPC_CONTINUATION_TOKEN = -1; +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +int32_t constexpr IPC_CONTINUATION_TOKEN = -1; +std::string const ARROW_SCHEMA_KEY = "ARROW:schema"; // Schema type ipc message has zero length body -constexpr int64_t SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; +int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; /** * @brief Basic data types in Parquet, determines how data is physically stored diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9c7a1348aec..2998bf6f0eb 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -562,14 +562,14 @@ aggregate_reader_metadata::aggregate_reader_metadata( // Collect and apply arrow:schema from Parquet's key value metadata section if (use_arrow_schema) { apply_arrow_schema(); } - // Erase "ARROW:schema" from the output pfm if exists + // Erase ARROW_SCHEMA_KEY from the output pfm if exists std::for_each( - keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase("ARROW:schema"); }); + keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase(ARROW_SCHEMA_KEY); }); } arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const { - // Check the key_value metadata for ARROW:schema, decode and walk it + // Check the key_value metadata for arrow schema, decode and walk it // Function to convert from flatbuf::duration type to cudf::type_id auto 
const duration_from_flatbuffer = [](flatbuf::Duration const* duration) { // TODO: we only need this for arrow::DurationType for now. Else, we can take in a @@ -643,9 +643,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const return true; }; - // TODO: Should we check if any file has the "ARROW:schema" key - // Or if all files have the same "ARROW:schema"? - auto const it = keyval_maps[0].find("ARROW:schema"); + auto const it = keyval_maps[0].find(ARROW_SCHEMA_KEY); if (it == keyval_maps[0].end()) { return {}; } // Decode the base64 encoded ipc message string diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 8b0f59ef33d..6bfa8519c76 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -145,7 +145,7 @@ class aggregate_reader_metadata { const; /** - * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message + * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer */ [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0bfbad8d260..b49f0d3ea73 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -99,7 +99,7 @@ struct aggregate_writer_metadata { // Append arrow schema to the key-value metadata if (not arrow_schema_ipc_message.empty()) { std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { - file.key_value_metadata.emplace_back(KeyValue{"ARROW:schema", arrow_schema_ipc_message}); + file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message}); }); } } @@ -347,6 +347,7 @@ struct leaf_schema_fn { schema_tree_node& col_schema; cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; + bool timestamp_is_int96; bool 
timestamp_is_utc; bool write_arrow_schema; @@ -470,38 +471,50 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.ts_scale = 1000; - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = 1000; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; + if (timestamp_is_int96) { + col_schema.ts_scale = -1000; // negative value indicates division by absolute value + } + // set logical type if it's not int96 + else { + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template @@ -615,6 +628,7 @@ std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, + bool int96_timestamps, bool utc_timestamps, bool write_arrow_schema) { @@ -882,9 +896,12 @@ std::vector construct_parquet_schema_tree( schema_tree_node col_schema{}; + bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); + cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, utc_timestamps, write_arrow_schema}); + leaf_schema_fn{ + col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -1141,13 +1158,14 @@ void calculate_page_fragments(device_span frag, */ void gather_fragment_statistics(device_span frag_stats, device_span frags, + bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( - frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream); + frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); } @@ -1657,6 +1675,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param dict_policy Policy for dictionary use * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write + * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any @@ -1681,15 +1700,16 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, dictionary_policy dict_policy, size_t max_dictionary_size, single_write_mode write_mode, + bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = - construct_parquet_schema_tree(vec, table_meta, write_mode, utc_timestamps, write_arrow_schema); + auto vec = table_to_linked_columns(input); + auto schema_tree = construct_parquet_schema_tree( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1817,9 +1837,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) - ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) - : ""); + (write_arrow_schema) ? construct_arrow_schema_ipc_message( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -1990,8 +2010,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and gather fragment statistics if (not frag_stats.is_empty()) { - gather_fragment_statistics( - frag_stats, {page_fragments.device_ptr(), static_cast(total_frags)}, stream); + gather_fragment_statistics(frag_stats, + {page_fragments.device_ptr(), static_cast(total_frags)}, + int96_timestamps, + stream); } } @@ -2295,6 +2317,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2325,6 +2348,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2403,6 +2427,7 @@ void writer::impl::write(table_view const& input, std::vector co _dict_policy, _max_dictionary_size, _single_write_mode, + _int96_timestamps, _utc_timestamps, 
_write_v2_headers, _write_arrow_schema, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index bcc8de13ceb..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,6 +153,7 @@ class writer::impl { dictionary_policy const _dict_policy; size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; + bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; bool const _write_arrow_schema; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 58dfc49d4aa..82e4c4bd82f 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -73,8 +73,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT); } } - std::string a = (arrow_schema) ? "1" : "0"; - auto filepath = "/home/coder/Durations" + a + ".parquet"; + + auto filepath = temp_env->get_temp_filepath("Durations.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_arrow_schema(arrow_schema); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 41f47899d84..8561e2ac4b3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,6 +1604,39 @@ def clone_field(table, name, datatype): assert_eq(expect, got) +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): + gdf_fname = tmpdir.join("gdf.parquet") + + if len(pdf) == 0: + pdf = pdf.reset_index(drop=True) + gdf = gdf.reset_index(drop=True) + + if "col_category" in pdf.columns: + pdf = pdf.drop(columns=["col_category"]) + if "col_category" in gdf.columns: + gdf = gdf.drop(columns=["col_category"]) + + assert_eq(pdf, gdf) + + # Write out 
the gdf using the GPU accelerated writer with INT96 timestamps + # TODO: store_schema must be false when working with INT96 timestamps + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + store_schema=False, + ) + + assert os.path.exists(gdf_fname) + + expect = pdf + got = pd.read_parquet(gdf_fname) + + # verify INT96 timestamps were converted back to the same data. + assert_eq(expect, got, check_categorical=False, check_dtype=False) + + def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=20, dtype="float64") @@ -3435,17 +3468,19 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) - # Write to Parquet + # Write to Parquet with arrow schema buffer = BytesIO() expected.to_parquet(buffer, store_schema=True) - # Read parquet with pyarrow and cudf readers + # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - got2 = cudf.read_parquet(buffer) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) # Check results assert_eq(expected, got) assert_eq(expected, got2) + assert_eq(expected, got3) @pytest.mark.parametrize( @@ -3516,18 +3551,21 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) + pd_expected = pa_expected.to_pandas() expected = cudf.DataFrame.from_arrow(pa_expected) - # Write expected data frame to Parquet + # Write expected data frame to Parquet with arrow schema buffer = BytesIO() expected.to_parquet(buffer, store_schema=True) - # Read Parquet with pyarrow + # Read Parquet with pyarrow and pandas pa_got = pq.read_table(buffer) + pd_got = pd.read_parquet(buffer) # Check results assert_eq(pa_expected, pa_got) + assert_eq(pd_expected, pd_got) # Convert to cuDF table and also 
read Parquet with cuDF reader got = cudf.DataFrame.from_arrow(pa_got) From 706eb186805dfa352681d2ced0f14437ad6a9f0b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 20:16:56 +0000 Subject: [PATCH 21/45] Modify int96 and arrow schema option behavior --- cpp/src/io/parquet/arrow_schema_writer.cpp | 48 ++++--------------- cpp/src/io/parquet/arrow_schema_writer.hpp | 1 - cpp/src/io/parquet/writer_impl.cu | 19 ++++++-- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 6 +++ 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 6e56c2f1e35..7d5419e2ebd 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -55,7 +55,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps); /** @@ -66,7 +65,6 @@ struct dispatch_to_flatbuf { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; single_write_mode const write_mode; - bool const int96_timestamps; bool const utc_timestamps; Offset& field_offset; flatbuf::Type& type_type; @@ -167,9 +165,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -180,9 +175,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = 
flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -194,9 +186,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -208,9 +197,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -281,7 +267,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. 
if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, int96_timestamps, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -292,12 +278,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - col->children[idx], - col_meta.child(idx), - write_mode, - int96_timestamps, - utc_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -317,23 +299,16 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(column->type(), - dispatch_to_flatbuf{fbb, - column, - column_metadata, - write_mode, - int96_timestamps, - utc_timestamps, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + column->type(), + dispatch_to_flatbuf{ + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -348,7 +323,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps) { // 
Lambda function to convert int32 to a string of uint8 bytes @@ -371,12 +345,8 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields(fbb, - thrust::get<0>(elem), - thrust::get<1>(elem), - write_mode, - int96_timestamps, - utc_timestamps); + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 15e4f63a9f9..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -78,7 +78,6 @@ using namespace cudf::io::detail; std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b49f0d3ea73..ab338c4ab49 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1837,9 +1837,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) ? construct_arrow_schema_ipc_message( - vec, table_meta, write_mode, int96_timestamps, utc_timestamps) - : ""); + (write_arrow_schema) + ? 
construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2317,7 +2317,8 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), + _int96_timestamps(options.is_enabled_int96_timestamps() and + not options.is_enabled_write_arrow_schema()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2328,6 +2329,10 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { + if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { + CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); + } + if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } @@ -2348,7 +2353,8 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), + _int96_timestamps(options.is_enabled_int96_timestamps() and + not options.is_enabled_write_arrow_schema()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2359,6 +2365,9 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { + if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { + CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); + } if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index cb4ce142543..32245539d3c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -100,6 +100,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void enable_int96_timestamps(bool val) except + void enable_utc_timestamps(bool val) except + void enable_write_arrow_schema(bool val) except + From fa247b7f015eb3df7c52b60da66f1ff716c42ada Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 20:23:25 +0000 Subject: [PATCH 22/45] Revert _use_arrow_schema to true --- cpp/include/cudf/io/parquet.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ede1994312d..e13d7aab4bd 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -72,7 +72,7 @@ class parquet_reader_options { // Whether to use PANDAS metadata to load columns bool _use_pandas_metadata = true; // Whether to read and use ARROW schema - bool _use_arrow_schema = false; + bool _use_arrow_schema = true; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; From 9607618d1e782e1933fba3620aee583bd80fac4b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 01:48:18 +0000 Subject: [PATCH 23/45] Add tests --- python/cudf/cudf/tests/test_parquet.py | 149 +++++++++++++++++++------ 1 file changed, 116 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py 
b/python/cudf/cudf/tests/test_parquet.py index 8561e2ac4b3..f2c46c1e192 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1798,10 +1798,11 @@ def test_parquet_write_bytes_io(simple_gdf): assert_eq(cudf.read_parquet(output), simple_gdf) -def test_parquet_writer_bytes_io(simple_gdf): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_bytes_io(simple_gdf, store_schema): output = BytesIO() - writer = ParquetWriter(output) + writer = ParquetWriter(output, store_schema=store_schema) writer.write_table(simple_gdf) writer.write_table(simple_gdf) writer.close() @@ -2133,7 +2134,8 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) -def test_parquet_write_to_dataset(tmpdir_factory, cols): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2149,7 +2151,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): "b": np.random.choice(np.arange(4), size=size), } ) - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) # Read back with cudf @@ -2165,7 +2167,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): } ) with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @pytest.mark.parametrize( @@ -2395,7 +2397,8 @@ def test_parquet_writer_list_large_mixed(tmpdir): assert_eq(expect, got) -def test_parquet_writer_list_chunked(tmpdir): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmpdir, store_schema): table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2416,7 +2419,7 @@ def 
test_parquet_writer_list_chunked(tmpdir): expect = cudf.concat([table1, table2]) expect = expect.reset_index(drop=True) - writer = ParquetWriter(fname) + writer = ParquetWriter(fname, store_schema=store_schema) writer.write_table(table1) writer.write_table(table2) writer.close() @@ -3393,30 +3396,85 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): # Check results for reader with schema assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) -def test_parquet_reader_roundtrip_structs_with_arrow_schema(): + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": 
datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Ensure that the structs with duration types are faithfully being # roundtripped across Parquet with arrow schema - data = { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - pdf = pd.DataFrame({"struct": pd.Series(data)}) buffer = BytesIO() @@ -3430,8 +3488,20 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + -def test_parquet_writer_roundtrip_with_arrow_schema(): +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_with_arrow_schema(index): # Ensure that the concrete and nested types are faithfully being roundtripped # across Parquet with arrow schema expected = cudf.DataFrame( @@ -3468,15 +3538,20 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) - # Write to Parquet with arrow schema + # Write to Parquet with arrow schema for faithful roundtrip buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True) + expected.to_parquet(buffer, store_schema=True, index=index) # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) got2 = 
cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) got3 = cudf.read_parquet(buffer) + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + # Check results assert_eq(expected, got) assert_eq(expected, got2) @@ -3547,7 +3622,10 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): ], ], ) -def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_structs_with_arrow_schema( + tmpdir, data, index +): # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) @@ -3557,12 +3635,17 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Write expected data frame to Parquet with arrow schema buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True) + expected.to_parquet(buffer, store_schema=True, index=index) # Read Parquet with pyarrow and pandas pa_got = pq.read_table(buffer) pd_got = pd.read_parquet(buffer) + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + pd_got = pd_got.drop(columns="__index_level_0__") + # Check results assert_eq(pa_expected, pa_got) assert_eq(pd_expected, pd_got) From a044f3fd90cc016717355aa83ed183f53628bb77 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:06:09 +0000 Subject: [PATCH 24/45] remove temp variables --- cpp/src/io/parquet/arrow_schema_writer.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 7d5419e2ebd..30759d323d1 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -301,6 +301,7 @@ FieldOffset 
make_arrow_schema_fields(FlatBufferBuilder& fbb, single_write_mode const write_mode, bool const utc_timestamps) { + // Variables to be set by the dispatch_to_flatbuf functor Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; @@ -310,14 +311,15 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, dispatch_to_flatbuf{ fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - auto const fb_name = fbb.CreateString(column_metadata.get_name()); - auto const fb_children = fbb.CreateVector(children.data(), children.size()); - auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); - DictionaryOffset dictionary = 0; - // push to field offsets vector return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); + fbb, + fbb.CreateString(column_metadata.get_name()), // name + is_col_nullable(column, column_metadata, write_mode), // nullable + type_type, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector } std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, From a5ab9fbd1b60b8e37e3a60554ed8d28f43d886f1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:09:03 +0000 Subject: [PATCH 25/45] minor comments cleanup --- cpp/src/io/parquet/arrow_schema_writer.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 30759d323d1..a9b3131166d 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -355,12 +355,13 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // create an ipc message flatbuffer fbb.Finish(flatbuf::CreateMessage( fbb, - 
flatbuf::MetadataVersion_V5, /* Metadata version V5 (latest) */ - flatbuf::MessageHeader_Schema, /* Schema type message header */ - flatbuf::CreateSchema( - fbb, flatbuf::Endianness::Endianness_Little, fbb.CreateVector(field_offsets)) - .Union(), /* Build an arrow:schema from the field vector */ - SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH /* Body length is zero for schema type ipc message */ + flatbuf::MetadataVersion_V5, // Metadata version V5 (latest) + flatbuf::MessageHeader_Schema, // Schema type message header + flatbuf::CreateSchema(fbb, + flatbuf::Endianness::Endianness_Little, + fbb.CreateVector(field_offsets)) + .Union(), // arrow:schema built from the field vector + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH // Body length is zero for schema type ipc message )); // Construct the final string and store it here to use its view in base64_encode From 844a1d6030f9ba89d33deba9d7c0b48c67625b49 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:25:09 +0000 Subject: [PATCH 26/45] revert convertedtype setting --- cpp/src/io/parquet/writer_impl.cu | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ab338c4ab49..b9e31f5704f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -520,10 +520,13 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // duration_D is based on int32_t and not a valid arrow duration type so simply convert to + // time32(ms). 
+ col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template @@ -532,9 +535,11 @@ struct leaf_schema_fn { col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; col_schema.stats_dtype = (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + // only write as time32 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - col_schema.ts_scale = 1000; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; } } @@ -544,8 +549,10 @@ struct leaf_schema_fn { col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; col_schema.stats_dtype = (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + // only write as time32 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } } @@ -554,8 +561,10 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; } } @@ -564,6 +573,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } From dc1608c446d0ef38e1fc11bd9309b1b4f654e5ce Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:30:18 +0000 Subject: [PATCH 27/45] Add decimal column conversion --- cpp/src/io/parquet/arrow_schema_writer.cpp | 24 +-- cpp/src/io/parquet/arrow_schema_writer.hpp | 25 --- cpp/src/io/parquet/writer_impl.cu | 217 ++++++++++++++------- cpp/src/io/parquet/writer_impl_helpers.cpp | 107 ++++++++++ cpp/src/io/parquet/writer_impl_helpers.hpp | 80 ++++++++ cpp/tests/io/parquet_writer_test.cpp | 44 +++++ python/cudf/cudf/tests/test_parquet.py | 12 +- 7 files changed, 399 insertions(+), 110 deletions(-) create mode 100644 cpp/src/io/parquet/writer_impl_helpers.cpp create mode 100644 cpp/src/io/parquet/writer_impl_helpers.hpp diff --git 
a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index a9b3131166d..accc5e52533 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -25,6 +25,7 @@ #include "io/utilities/base64_utilities.hpp" #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include "writer_impl_helpers.hpp" #include #include @@ -243,19 +244,20 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - if constexpr (std::is_same_v) { - type_type = flatbuf::Type_Decimal; - field_offset = - flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) - .Union(); - } - // cuDF-PQ writer supports ``decimal32`` and ``decimal64`` types, not directly supported by - // Arrow without explicit conversion. See more: - // https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155. - else { - // TODO: Should we fail here or just not write arrow schema?. + if constexpr (not std::is_same_v) { + // ``decimal32`` and ``decimal64`` types are not supported by + // Arrow without explicit conversion. CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } + + type_type = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, + (col_meta.is_decimal_precision_set()) + ? 
col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, + col->type().scale(), + 128) + .Union(); } template diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..4043889ea99 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -34,31 +34,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -/** - * @brief Returns ``true`` if the column is nullable or if the write mode is not - * set to write the table all at once instead of chunked - * - * @param column A view of the column - * @param column_metadata Metadata of the column - * @param write_mode Flag to indicate that we are guaranteeing a single table write - * - * @return Whether the column is nullable. - */ -[[nodiscard]] inline bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - single_write_mode write_mode) -{ - if (column_metadata.is_nullability_defined()) { - CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return column_metadata.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. 
- return write_mode == single_write_mode::NO or column->nullable(); -} - /** * @brief Construct and return arrow schema from input parquet schema * diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b9e31f5704f..ef2e3e54245 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -31,6 +31,7 @@ #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" +#include "writer_impl_helpers.hpp" #include #include @@ -192,26 +193,6 @@ struct aggregate_writer_metadata { namespace { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - /** * @brief Convert a mask of encodings to a vector. 
* @@ -582,25 +563,30 @@ struct leaf_schema_fn { template std::enable_if_t(), void> operator()() { - if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; - } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; - col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; - } else if (std::is_same_v) { + // If writing arrow schema, then convert d32 and d64 to d128 + if (write_arrow_schema or std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { - CUDF_FAIL("Unsupported fixed point type for parquet writer"); + if (std::is_same_v) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; + } else if (std::is_same_v) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; + } else { + CUDF_FAIL("Unsupported fixed point type for parquet writer"); + } } + + // Write logical and converted types, decimal scale and precision col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs col_schema.logical_type->decimal_type->scale = 
-col->type().scale(); @@ -1179,32 +1165,6 @@ void gather_fragment_statistics(device_span frag_stats, stream.synchronize(); } -auto to_nvcomp_compression_type(Compression codec) -{ - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -auto page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, @@ -1644,23 +1604,125 @@ size_t column_index_buffer_size(EncColumnChunk* ck, } /** - * @brief Fill the table metadata with default column names. 
+ * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data + */ +template +rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, + rmm::cuda_stream_view stream) +{ + size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + + rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); + + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(d128_buffer.data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return d128_buffer; +} + +/** + * @brief Helper function to convert decimal32 and decimal64 columns to decimal128 data, + * update the input table metadata, and return a new vector of column views. * - * @param table_meta The table metadata to fill + * @param[in,out] table_meta The table metadata + * @param input The input table + * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A vector of column views with decimal32/decimal64 columns converted to decimal128 */ -void fill_table_meta(std::unique_ptr const& table_meta) +std::vector convert_decimal_columns_and_metadata( + table_input_metadata& table_meta, + table_view const& table, + std::vector>& d128_vectors, + rmm::cuda_stream_view stream) { - // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); - for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); - } - }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); - } + std::vector converted_column_views{table.begin(), table.end()}; + + std::function convert_column = + [&](column_view& column, column_in_metadata& metadata) -> void { + // Vector of passable-by-reference children column views + std::vector converted_children{column.child_begin(), column.child_end()}; + // Process children column views first + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.num_children()), + [&](auto const idx) { convert_column(converted_children[idx], metadata.child(idx)); }); + + // Process this column view. Only convert if decimal32 and decimal64 column. 
+ switch (column.type().id()) { + case type_id::DECIMAL32: + // Convert data to decimal128 type + d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + case type_id::DECIMAL64: + // Convert data to decimal128 type + d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + default: + // Update the children vector keeping everything else the same + column = column_view{column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + } + }; + + // Convert each column view + std::for_each(thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.begin(), + table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.end(), + table_meta.column_metadata.end())), + [&](auto elem) { convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + + return converted_column_views; } /** @@ -1717,7 +1779,16 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, host_span const> out_sink, rmm::cuda_stream_view 
stream) { - auto vec = table_to_linked_columns(input); + // Container to store decimal128 converted data if needed + std::vector> d128_vectors; + + // Convert decimal32/decimal64 data to decimal128 if writing arrow schema + // and initialize LinkedColVector + auto vec = table_to_linked_columns( + (write_arrow_schema) + ? table_view({convert_decimal_columns_and_metadata(table_meta, input, d128_vectors, stream)}) + : input); + auto schema_tree = construct_parquet_schema_tree( vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp new file mode 100644 index 00000000000..b2400ec19b2 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file writer_impl_helpers.cpp + * @brief Helper function implementation for Parquet writer + */ + +#include "writer_impl_helpers.hpp" + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec) +{ + if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; + if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; + CUDF_FAIL("Unsupported compression type"); +} + +uint32_t page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + +void fill_table_meta(std::unique_ptr const& table_meta) +{ + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < 
col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } +} + +[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} + +[[nodiscard]] bool is_col_fixed_width(column_view const& column) +{ + if (column.type().id() == type_id::STRUCT) { + return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width); + } + + return is_fixed_width(column.type()); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp new file mode 100644 index 00000000000..9ffa4538134 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file writer_impl_helpers.hpp + * @brief Helper function declarations for Parquet writer + */ + +#pragma once +#include "io/comp/nvcomp_adapter.hpp" +#include "parquet_common.hpp" + +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +/** + * @brief Function that translates GDF compression to parquet compression. + * + * @param compression The compression type + * @return The supported Parquet compression + */ +Compression to_parquet_compression(compression_type compression); + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec); + +uint32_t page_alignment(Compression codec); + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); + +/** + * @brief Fill the table metadata with default column names. + * + * @param table_meta The table metadata to fill + */ +void fill_table_meta(std::unique_ptr const& table_meta); + +/** + * @brief Returns ``true`` if the column is nullable or if the write mode is not + * set to write the table all at once instead of chunked + * + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode); +/** + * @brief Returns ``true`` if the given column has a fixed size. + * + * This doesn't check every row, so assumes string and list columns are not fixed, even + * if each row is the same width. + * TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes. 
+ * + * @param column A view of the column + * + * @return Whether the column has a fixed size + */ +[[nodiscard]] bool is_col_fixed_width(column_view const& column); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 82e4c4bd82f..e3430c16363 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -503,6 +503,50 @@ TEST_F(ParquetWriterTest, DecimalWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); } +TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .write_arrow_schema(true); + + cudf::io::table_input_metadata expected_metadata(table); + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION); + expected_metadata.column_metadata[1].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto expected_col1 = 
cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto expected_table = table_view({expected_col0, expected_col1}); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table); +} + TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { auto const unused_table = std::make_unique(); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index f2c46c1e192..fd5dd439f18 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3528,7 +3528,13 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), "list": list([[1, 2], [1, 2], [1, 2]]), "bool": cudf.Series([True, None, False], dtype=bool), - "fixed_pt": cudf.Series([0.00, 1.0, None]).astype( + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( cudf.Decimal128Dtype(7, 2) ), "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), @@ -3542,6 +3548,10 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): buffer = BytesIO() expected.to_parquet(buffer, store_schema=True, index=index) + # Convert decimal types to d128 + expected = expected.astype({'fixed32': cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({'fixed64': cudf.Decimal128Dtype(18, 2)}) + # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) From 0946eb48777a6c95c7bf5bc6379cbca195a0aa48 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 4 Jun 2024 08:35:54 +0000 
Subject: [PATCH 28/45] minor ruff-formatting fix --- python/cudf/cudf/tests/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fd5dd439f18..8f6517e15a3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3549,8 +3549,8 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): expected.to_parquet(buffer, store_schema=True, index=index) # Convert decimal types to d128 - expected = expected.astype({'fixed32': cudf.Decimal128Dtype(9, 2)}) - expected = expected.astype({'fixed64': cudf.Decimal128Dtype(18, 2)}) + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) From ea01aeedf4c3eef1f13268f1f63c3ed6df2fa8d1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:56:35 +0000 Subject: [PATCH 29/45] refactor and move some helpers to writer_impl_helpers.cpp --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/writer_impl.cu | 135 +++++++-------------- cpp/src/io/parquet/writer_impl_helpers.cpp | 31 +++++ cpp/src/io/parquet/writer_impl_helpers.hpp | 9 ++ 4 files changed, 85 insertions(+), 91 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 633f5f36b38..f66408e318a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -423,6 +423,7 @@ add_library( src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu + src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ef2e3e54245..f9fd936a40b 
100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -42,9 +42,6 @@ #include #include #include -#include -#include -#include #include #include @@ -254,52 +251,6 @@ void update_chunk_encoding_stats(ColumnChunkMetaData& chunk_meta, if (not result.empty()) { chunk_meta.encoding_stats = std::move(result); } } -/** - * @brief Compute size (in bytes) of the data stored in the given column. - * - * @param column The input column - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The data size of the input - */ -size_t column_size(column_view const& column, rmm::cuda_stream_view stream) -{ - if (column.is_empty()) { return 0; } - - if (is_fixed_width(column.type())) { - return size_of(column.type()) * column.size(); - } else if (column.type().id() == type_id::STRING) { - auto const scol = strings_column_view(column); - return cudf::strings::detail::get_offset_value( - scol.offsets(), column.size() + column.offset(), stream) - - cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); - } else if (column.type().id() == type_id::STRUCT) { - auto const scol = structs_column_view(column); - size_t ret = 0; - for (int i = 0; i < scol.num_children(); i++) { - ret += column_size(scol.get_sliced_child(i, stream), stream); - } - return ret; - } else if (column.type().id() == type_id::LIST) { - auto const lcol = lists_column_view(column); - return column_size(lcol.get_sliced_child(stream), stream); - } - - CUDF_FAIL("Unexpected compound type"); -} - -// checks to see if the given column has a fixed size. This doesn't -// check every row, so assumes string and list columns are not fixed, even -// if each row is the same width. -// TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes. 
-bool is_col_fixed_width(column_view const& column) -{ - if (column.type().id() == type_id::STRUCT) { - return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width); - } - - return is_fixed_width(column.type()); -} - /** * @brief Extends SchemaElement to add members required in constructing parquet_column_view * @@ -1642,85 +1593,87 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co } /** - * @brief Helper function to convert decimal32 and decimal64 columns to decimal128 data, + * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param input The input table * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * * @return A device vector containing the converted decimal128 data */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - table_view const& table, std::vector>& d128_vectors, + table_view const& table, rmm::cuda_stream_view stream) { - std::vector converted_column_views{table.begin(), table.end()}; - - std::function convert_column = - [&](column_view& column, column_in_metadata& metadata) -> void { + // Lambda function to convert each decimal32/decimal64 column to decimal128. 
+ std::function convert_column = + [&](column_view column, column_in_metadata& metadata) -> column_view { // Vector of passable-by-reference children column views - std::vector converted_children{column.child_begin(), column.child_end()}; + std::vector converted_children; + // Process children column views first - std::for_each( + std::transform( thrust::make_counting_iterator(0), thrust::make_counting_iterator(column.num_children()), - [&](auto const idx) { convert_column(converted_children[idx], metadata.child(idx)); }); + std::back_inserter(converted_children), + [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); }); // Process this column view. Only convert if decimal32 and decimal64 column. switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector - column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, - column.size(), - d128_vectors.back().data(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new 
column view from the d128 data vector - column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, - column.size(), - d128_vectors.back().data(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; default: // Update the children vector keeping everything else the same - column = column_view{column.type(), - column.size(), - column.head(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; } }; + // Vector of converted column views + std::vector converted_column_views; + // Convert each column view - std::for_each(thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.begin(), - table_meta.column_metadata.begin())), - thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.end(), - table_meta.column_metadata.end())), - [&](auto elem) { convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + std::transform( + thrust::make_zip_iterator( + thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())), + std::back_inserter(converted_column_views), + [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); return converted_column_views; } @@ -1786,7 +1739,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, input, d128_vectors, stream)}) + ? 
table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index b2400ec19b2..364b1a9777a 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -21,6 +21,11 @@ #include "writer_impl_helpers.hpp" +#include +#include +#include +#include + namespace cudf::io::parquet::detail { using namespace cudf::io::detail; @@ -80,6 +85,32 @@ void fill_table_meta(std::unique_ptr const& table_meta) } } +size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +{ + if (column.is_empty()) { return 0; } + + if (is_fixed_width(column.type())) { + return size_of(column.type()) * column.size(); + } else if (column.type().id() == type_id::STRING) { + auto const scol = strings_column_view(column); + return cudf::strings::detail::get_offset_value( + scol.offsets(), column.size() + column.offset(), stream) - + cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); + } else if (column.type().id() == type_id::STRUCT) { + auto const scol = structs_column_view(column); + size_t ret = 0; + for (int i = 0; i < scol.num_children(); i++) { + ret += column_size(scol.get_sliced_child(i, stream), stream); + } + return ret; + } else if (column.type().id() == type_id::LIST) { + auto const lcol = lists_column_view(column); + return column_size(lcol.get_sliced_child(stream), stream); + } + + CUDF_FAIL("Unexpected compound type"); +} + [[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode write_mode) diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 9ffa4538134..316ee1da240 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ 
-51,6 +51,15 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block */ void fill_table_meta(std::unique_ptr const& table_meta); +/** + * @brief Compute size (in bytes) of the data stored in the given column. + * + * @param column The input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The data size of the input + */ +size_t column_size(column_view const& column, rmm::cuda_stream_view stream); + /** * @brief Returns ``true`` if the column is nullable or if the write mode is not * set to write the table all at once instead of chunked From b9f2989e31a86d76aef1a95f1b75058c972d9d87 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 6 Jun 2024 21:05:41 +0000 Subject: [PATCH 30/45] resolve conflicts, minor doc and pytest updates. --- python/cudf/cudf/_lib/parquet.pyx | 4 ++-- python/cudf/cudf/io/parquet.py | 7 +++++++ python/cudf/cudf/tests/test_parquet.py | 17 +++++++++-------- python/cudf/cudf/utils/ioutils.py | 3 +++ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1dbe31aac6a..86a2585121d 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -621,9 +621,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. - write_arrow_schema : bool, default False + store_schema : bool, default False If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section. + file footer's key-value metadata section for faithful round-tripping. 
See Also -------- cudf.io.parquet.write_parquet diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ef5bd50053f..e86334633ef 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -156,6 +156,7 @@ def write_to_dataset( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. For each combination of partition group and value, @@ -244,6 +245,9 @@ def write_to_dataset( output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. """ fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -287,6 +291,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) else: @@ -314,6 +319,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) return metadata @@ -1018,6 +1024,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) partition_info = ( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 47334cf1371..6e1e491c732 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2555,6 +2555,10 @@ def normalized_equals(value1, value2): value1 = None if value2 is pd.NA or value2 is pd.NaT: value2 = None + if isinstance(value1, np.datetime64): + value1 = pd.Timestamp(value1).to_pydatetime() + if isinstance(value2, np.datetime64): 
+ value2 = pd.Timestamp(value2).to_pydatetime() if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -3489,7 +3493,6 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Check results assert_eq(expected, got) -<<<<<<< arrow-schema-support-pq-writer # Reset buffer buffer = BytesIO() @@ -3501,6 +3504,9 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Convert to cudf table for an apple to apple comparison expected = cudf.from_pandas(pdf) + # Check results + assert_eq(expected, got) + @pytest.mark.parametrize("index", [None, True, False]) def test_parquet_writer_roundtrip_with_arrow_schema(index): @@ -3641,7 +3647,6 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) - pd_expected = pa_expected.to_pandas() expected = cudf.DataFrame.from_arrow(pa_expected) @@ -3649,18 +3654,15 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( buffer = BytesIO() expected.to_parquet(buffer, store_schema=True, index=index) - # Read Parquet with pyarrow and pandas + # Read Parquet with pyarrow pa_got = pq.read_table(buffer) - pd_got = pd.read_parquet(buffer) # drop the index column for comparison: __index_level_0__ if index: pa_got = pa_got.drop(columns="__index_level_0__") - pd_got = pd_got.drop(columns="__index_level_0__") # Check results assert_eq(pa_expected, pa_got) - assert_eq(pd_expected, pd_got) # Convert to cuDF table and also read Parquet with cuDF reader got = cudf.DataFrame.from_arrow(pa_got) @@ -3669,7 +3671,7 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( # Check results assert_eq(expected, got) assert_eq(expected, got2) -======= + @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) @@ -3695,4 
+3697,3 @@ def test_parquet_chunked_reader( ) actual = reader.read() assert_eq(expected, actual) ->>>>>>> branch-24.08 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0209c692935..5d115c6be5a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -322,6 +322,9 @@ output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. +store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. **kwargs Additional parameters will be passed to execution engines other than ``cudf``. From 8e77687ceaf0870974e4707b3edcb497d9de0ea3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 20:21:34 +0000 Subject: [PATCH 31/45] Changes from reviewer suggestions --- cpp/include/cudf/io/parquet.hpp | 8 -------- cpp/src/io/parquet/arrow_schema_writer.cpp | 10 +++------- cpp/src/io/parquet/arrow_schema_writer.hpp | 3 --- cpp/src/io/parquet/parquet.hpp | 11 ----------- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/parquet/writer_impl_helpers.hpp | 2 +- 7 files changed, 6 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 6329f3dbe65..8a124069e54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -840,14 +840,6 @@ class parquet_writer_options_base { */ void enable_write_arrow_schema(bool val); - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks in sink info - */ - void set_column_chunks_file_paths(std::vector file_paths); - /** * @brief Sets the maximum row group size, in bytes. 
 * diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index accc5e52533..668d4754800 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -159,7 +159,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Date; - // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + // Date type (Set unit type to DAY for Arrow's Date32) field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); } @@ -209,6 +209,8 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. + // This also allows for easy and faithful roundtripping with cudf. type_type = flatbuf::Type_Time; field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } @@ -244,12 +246,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - if constexpr (not std::is_same_v) { - // ``decimal32`` and ``decimal64`` types are not supported by - // Arrow without explicit conversion. 
- CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); - } - type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, (col_meta.is_decimal_precision_set()) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 4043889ea99..7b7c6cf722c 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -27,9 +27,6 @@ #include #include -#include -#include - namespace cudf::io::parquet::detail { using namespace cudf::io::detail; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index b0734719002..e35742c2527 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -57,15 +57,6 @@ struct TimeUnit { Type type; }; -struct DateUnit { - enum Type : char { DAYS = 0, MILLIS = 1 }; - Type type; -}; - -struct DateType { - DateUnit unit = {DateUnit::DAYS}; -}; - struct TimeType { // Default to true because the timestamps are implicitly in UTC // Writer option overrides this default @@ -106,14 +97,12 @@ struct LogicalType { thrust::optional decimal_type; thrust::optional time_type; thrust::optional timestamp_type; - thrust::optional date_type; thrust::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} - LogicalType(DateType&& date) : type(DATE), date_type(date) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} constexpr bool is_time_millis() const diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 2998bf6f0eb..63f25d417ff 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -45,7 +45,7 @@ thrust::optional converted_to_logical_type(SchemaElement 
const& sch case MAP: return LogicalType{LogicalType::MAP}; case LIST: return LogicalType{LogicalType::LIST}; case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; - case DATE: return LogicalType{DateType{DateUnit::DAYS}}; + case DATE: return LogicalType{LogicalType::DATE}; case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f9fd936a40b..f9ca9553cf1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -397,7 +397,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{DateType{DateUnit::DAYS}}; + col_schema.logical_type = LogicalType{LogicalType::DATE}; } template diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 316ee1da240..6adacc981d1 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -58,7 +58,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); * @param stream CUDA stream used for device memory operations and kernel launches * @return The data size of the input */ -size_t column_size(column_view const& column, rmm::cuda_stream_view stream); +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); /** * @brief Returns ``true`` if the column is nullable or if the write mode is not From 30057c0636e7c1dd683a647d35846bfd8197ea8d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 20:32:14 +0000 Subject: [PATCH 32/45] Minor changes from reviewer suggestions. 
--- cpp/src/io/parquet/writer_impl.cu | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f9ca9553cf1..d2604f058bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -464,11 +464,14 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - // only write as time32 logical type if not writing arrow schema - if (not write_arrow_schema) { + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; col_schema.ts_scale = 1000; @@ -478,11 +481,14 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - // only write as time32 logical type if not writing arrow schema - if (not write_arrow_schema) { + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. 
Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } @@ -493,7 +499,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; - // only write as time64 logical type if not writing arrow schema + // Only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.converted_type = ConvertedType::TIME_MICROS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; @@ -505,7 +511,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; - // only write as time64 logical type if not writing arrow schema + // Only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } From 0f136429b34d171747904349926852c0d9554d19 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 21:56:17 +0000 Subject: [PATCH 33/45] minor update. add nodiscard. 
--- cpp/src/io/parquet/writer_impl_helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 364b1a9777a..9ded83736d6 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -85,7 +85,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) } } -size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream) { if (column.is_empty()) { return 0; } From f9c123bebba2456ef17cd90be7a73f486df1fe2f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 01:46:00 +0000 Subject: [PATCH 34/45] Minor changes addressing reviewer comments. --- cpp/src/io/functions.cpp | 11 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 121 +++++++++++---------- cpp/src/io/parquet/arrow_schema_writer.hpp | 11 +- cpp/src/io/parquet/writer_impl.cu | 9 +- cpp/src/io/parquet/writer_impl_helpers.hpp | 26 ++++- 5 files changed, 105 insertions(+), 73 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 3a844312367..b4ece9cec66 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -762,6 +762,9 @@ void parquet_writer_options_base::set_compression(compression_type compression) void parquet_writer_options_base::enable_int96_timestamps(bool req) { + CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(), + "INT96 timestamps and arrow schema cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); _write_timestamps_as_int96 = req; } @@ -770,7 +773,13 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val) _write_timestamps_as_UTC = val; } -void parquet_writer_options_base::enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } +void parquet_writer_options_base::enable_write_arrow_schema(bool val) +{ + CUDF_EXPECTS(not 
val or not is_enabled_int96_timestamps(), + "arrow schema and INT96 timestamps cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); + _write_arrow_schema = val; +} void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 668d4754800..458ef7f065d 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -33,6 +33,10 @@ namespace cudf::io::parquet::detail { +using namespace cudf::io::detail; + +namespace { + // Copied over from arrow source for better code readability namespace flatbuf = cudf::io::parquet::flatbuf; using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; @@ -68,97 +72,97 @@ struct dispatch_to_flatbuf { single_write_mode const write_mode; bool const utc_timestamps; Offset& field_offset; - flatbuf::Type& type_type; + flatbuf::Type& field_type_id; std::vector& children; template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Bool; - field_offset = flatbuf::CreateBool(fbb).Union(); + field_type_id = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + field_type_id = 
flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_FloatingPoint; - field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_FloatingPoint; - 
field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Utf8View; - field_offset = flatbuf::CreateUtf8View(fbb).Union(); + field_type_id = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Date; + field_type_id = flatbuf::Type_Date; // Date type (Set unit type to DAY for arrows's Date32) field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); } @@ -166,7 +170,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) @@ -176,7 +180,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -187,7 +191,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -198,7 +202,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -211,46 +215,46 @@ struct dispatch_to_flatbuf { { // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. // This also allows for easy and faithful roundtripping with cudf. 
- type_type = flatbuf::Type_Time; - field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + field_type_id = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); } template std::enable_if_t(), void> operator()() { - type_type = flatbuf::Type_Decimal; - field_offset = flatbuf::CreateDecimal(fbb, + field_type_id = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, (col_meta.is_decimal_precision_set()) - ? col_meta.get_decimal_precision() - : MAX_DECIMAL128_PRECISION, + ? 
col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, col->type().scale(), 128) .Union(); @@ -266,8 +270,8 @@ struct dispatch_to_flatbuf { if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); - type_type = flatbuf::Type_List; - field_offset = flatbuf::CreateList(fbb).Union(); + field_type_id = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); } // Traverse the struct in DFS manner and process children fields. @@ -279,8 +283,8 @@ struct dispatch_to_flatbuf { return make_arrow_schema_fields( fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); - type_type = flatbuf::Type_Struct_; - field_offset = flatbuf::CreateStruct_(fbb).Union(); + field_type_id = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); } } @@ -300,26 +304,33 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, bool const utc_timestamps) { // Variables to be set by the dispatch_to_flatbuf functor - Offset field_offset = 0; - flatbuf::Type type_type = flatbuf::Type_NONE; + Offset field_offset = 0; + flatbuf::Type field_type_id = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + field_offset, + field_type_id, + children}); // push to field offsets vector return flatbuf::CreateField( fbb, fbb.CreateString(column_metadata.get_name()), // name is_col_nullable(column, column_metadata, write_mode), // nullable - type_type, // type id + field_type_id, // type id field_offset, // field offset {0}, // DictionaryOffset fbb.CreateVector(children.data(), children.size())); // children vector } +} // namespace + std::string 
construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 7b7c6cf722c..1b62ef35c86 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -29,8 +29,6 @@ namespace cudf::io::parquet::detail { -using namespace cudf::io::detail; - /** * @brief Construct and return arrow schema from input parquet schema * @@ -47,9 +45,10 @@ using namespace cudf::io::detail; * * @return The constructed arrow ipc message string */ -std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, - table_input_metadata const& metadata, - single_write_mode const write_mode, - bool const utc_timestamps); +std::string construct_arrow_schema_ipc_message( + cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + ::cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 01d7de84f0f..c63b12eaa38 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1577,7 +1577,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(column.size()), [in = column.begin(), @@ -2356,8 +2356,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - 
_int96_timestamps(options.is_enabled_int96_timestamps() and - not options.is_enabled_write_arrow_schema()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2368,10 +2367,6 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { - if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { - CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. Disabling INT96 timestamps."); - } - if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 6adacc981d1..3f96d03ab01 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -28,8 +28,6 @@ namespace cudf::io::parquet::detail { -using namespace cudf::io::detail; - /** * @brief Function that translates GDF compression to parquet compression. * @@ -38,10 +36,30 @@ using namespace cudf::io::detail; */ Compression to_parquet_compression(compression_type compression); +/** + * @brief Function that translates the given compression codec to nvcomp compression type. + * + * @param codec Compression codec + * @return Translated nvcomp compression type + */ nvcomp::compression_type to_nvcomp_compression_type(Compression codec); +/** + * @brief Function that computes input alignment requirements for the given compression type. + * + * @param codec Compression codec + * @return Required alignment + */ uint32_t page_alignment(Compression codec); +/** + * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the + * batch. 
+ * + * @param codec Compression codec + * @param compression_blocksize Size of the largest uncompressed chunk in the batch + * @return Maximum compressed chunk size + */ size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); /** @@ -64,7 +82,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); * @brief Returns ``true`` if the column is nullable or if the write mode is not * set to write the table all at once instead of chunked * - * @param column A view of the column + * @param column A view of the (linked) column * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @@ -72,7 +90,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); */ [[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, - single_write_mode write_mode); + ::cudf::io::detail::single_write_mode write_mode); /** * @brief Returns ``true`` if the given column has a fixed size. 
* From 92d88a04dbdccb23962a865eac48df706c1619de Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 19:47:04 +0000 Subject: [PATCH 35/45] Rename `is_col_nullable` to `is_output_col_nullable` --- cpp/src/io/parquet/arrow_schema_writer.cpp | 12 ++++++------ cpp/src/io/parquet/writer_impl.cu | 4 ++-- cpp/src/io/parquet/writer_impl_helpers.cpp | 6 +++--- cpp/src/io/parquet/writer_impl_helpers.hpp | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 458ef7f065d..d7e23b63774 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -321,12 +321,12 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, // push to field offsets vector return flatbuf::CreateField( fbb, - fbb.CreateString(column_metadata.get_name()), // name - is_col_nullable(column, column_metadata, write_mode), // nullable - field_type_id, // type id - field_offset, // field offset - {0}, // DictionaryOffset - fbb.CreateVector(children.data(), children.size())); // children vector + fbb.CreateString(column_metadata.get_name()), // name + is_output_column_nullable(column, column_metadata, write_mode), // nullable + field_type_id, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector } } // namespace diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 229fc9d0401..fd4991da41f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -595,7 +595,7 @@ std::vector construct_parquet_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool const col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode); 
auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -820,7 +820,7 @@ std::vector construct_parquet_schema_tree( right_child_meta.set_name("value"); // check the repetition type of key is required i.e. the col should be non-nullable auto key_col = col->children[lists_column_view::child_column_index]->children[0]; - CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode), + CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode), "key column cannot be nullable. For chunked writing, explicitly set the " "nullability to false in metadata"); // process key diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 9ded83736d6..529fe1cac2c 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -111,9 +111,9 @@ void fill_table_meta(std::unique_ptr const& table_meta) CUDF_FAIL("Unexpected compound type"); } -[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - single_write_mode write_mode) +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) { if (column_metadata.is_nullability_defined()) { CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 3f96d03ab01..1e461e9b4bc 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -88,9 +88,9 @@ void fill_table_meta(std::unique_ptr const& table_meta); * * @return Whether the column is nullable. 
*/ -[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - ::cudf::io::detail::single_write_mode write_mode); +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + ::cudf::io::detail::single_write_mode write_mode); /** * @brief Returns ``true`` if the given column has a fixed size. * From df11288043f45bdf4de322d7091f3dcb8cf428fe Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 19:56:30 +0000 Subject: [PATCH 36/45] minor comment update --- cpp/src/io/parquet/arrow_schema_writer.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index d7e23b63774..5f63c93cff3 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -291,9 +291,8 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and - // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. - CUDF_FAIL("Dictionary columns are not supported for writing arrow schema"); + // `dictionary32` columns are not written to parquet by cudf. 
+ CUDF_FAIL("Dictionary columns are not supported for writing"); } }; From 578c8e1c89cee46741ea359172b37a45200ea1c8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 20:06:00 +0000 Subject: [PATCH 37/45] minor comment update --- cpp/src/io/parquet/writer_impl_helpers.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 1e461e9b4bc..2f8125f3c44 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -79,8 +79,10 @@ void fill_table_meta(std::unique_ptr const& table_meta); [[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); /** - * @brief Returns ``true`` if the column is nullable or if the write mode is not - * set to write the table all at once instead of chunked + * @brief Indicates if the column should be marked as nullable in the output schema + * + * Returns `true` if the input column is nullable or if the write mode is not set to + * write the table all at once instead of chunked. 
* * @param column A view of the (linked) column * @param column_metadata Metadata of the column From b1e6b6fd51a0b2cf4f52c936e2b9613baa5e262b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 26 Jun 2024 01:53:13 +0000 Subject: [PATCH 38/45] Minor refactor --- cpp/src/io/parquet/writer_impl.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f1ac122482b..94594c83996 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2429,8 +2429,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps() and - not options.is_enabled_write_arrow_schema()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2441,9 +2440,6 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { - if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { - CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); - } if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } From c011e512cefa2362b7e30617afc06f9008c99a69 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 27 Jun 2024 21:19:02 +0000 Subject: [PATCH 39/45] Incorporating minor suggestions from review --- cpp/src/io/parquet/arrow_schema_writer.cpp | 3 ++- cpp/src/io/parquet/writer_impl_helpers.cpp | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 5f63c93cff3..ddf65e9020f 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -345,8 +345,9 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // Instantiate a flatbuffer builder FlatBufferBuilder fbb; - // Create an empty field offset vector + // Create an empty field offset vector and reserve space for linked columns std::vector field_offsets; + field_offsets.reserve(linked_columns.size()); // populate field offsets (aka schema fields) std::transform(thrust::make_zip_iterator( diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 04d5730528d..fb20a5b0999 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -46,11 +46,13 @@ Compression to_parquet_compression(compression_type compression) nvcomp::compression_type to_nvcomp_compression_type(Compression codec) { - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); + switch (codec) { + case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; + 
case Compression::ZSTD: return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; + default: CUDF_FAIL("Unsupported compression type"); + } } uint32_t page_alignment(Compression codec) From b6a54ecae453de986e3b5d8abbcac104569ea8a2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:54:24 +0000 Subject: [PATCH 40/45] Test for exception handling to_parquet with int96 and arrow schema enabled --- python/cudf/cudf/tests/test_parquet.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 1010823d643..2db9800fd86 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1617,12 +1617,10 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - # TODO: store_schema must be false when working with INT96 timestamps gdf.to_parquet( gdf_fname.strpath, index=None, int96_timestamps=True, - store_schema=False, ) assert os.path.exists(gdf_fname) @@ -3627,6 +3625,24 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): assert_eq(expected, got3) +def test_parquet_writer_int96_timestamps_and_arrow_schema(index): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps and arrow_schema + # enabled should throw an exception. 
+ with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + @pytest.mark.parametrize( "data", [ From bddfabe7f3de79d596727cd95f1e65f43d5610a4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 5 Jul 2024 18:45:54 +0000 Subject: [PATCH 41/45] Minor fix for failing pytests --- python/cudf/cudf/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 2db9800fd86..ff0c9040737 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3625,7 +3625,7 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): assert_eq(expected, got3) -def test_parquet_writer_int96_timestamps_and_arrow_schema(index): +def test_parquet_writer_int96_timestamps_and_arrow_schema(): df = cudf.DataFrame( { "timestamp": cudf.Series( From e9ab52f0f1888fa0011b38616a9d589fa701cbaf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 01:28:02 +0000 Subject: [PATCH 42/45] Minor changes from reviewer suggestions --- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/parquet/writer_impl_helpers.cpp | 6 +++--- cpp/src/io/parquet/writer_impl_helpers.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 94594c83996..66b4fce16fe 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2474,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector co CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = std::make_unique(input); } - fill_table_meta(_table_meta); + fill_table_meta(*_table_meta); // All kinds of memory allocation and data compressions/encoding are performed here. 
// If any error occurs, such as out-of-memory exception, the internal state of the current diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index fb20a5b0999..e2f09f872d3 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -72,7 +72,7 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -void fill_table_meta(std::unique_ptr const& table_meta) +void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta std::function add_default_name = @@ -82,8 +82,8 @@ void fill_table_meta(std::unique_ptr const& table_meta) add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); } }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) { + add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i)); } } diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 73d302ec3fc..a85411594e9 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -67,7 +67,7 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block * * @param table_meta The table metadata to fill */ -void fill_table_meta(std::unique_ptr const& table_meta); +void fill_table_meta(table_input_metadata& table_meta); /** * @brief Compute size (in bytes) of the data stored in the given column. 
From 9b163f791f2e1dae0833eed63f372d4e2bedb9cc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:22:22 -0700 Subject: [PATCH 43/45] Update cpp/src/io/parquet/arrow_schema_writer.hpp Co-authored-by: Bradley Dice --- cpp/src/io/parquet/arrow_schema_writer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 1b62ef35c86..bc901ded9af 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -48,7 +48,7 @@ namespace cudf::io::parquet::detail { std::string construct_arrow_schema_ipc_message( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, - ::cudf::io::detail::single_write_mode const write_mode, + cudf::io::detail::single_write_mode const write_mode, bool const utc_timestamps); } // namespace cudf::io::parquet::detail From 13a06acbc6dd8e4dd8192752c1382b5aada506bc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:25:47 +0000 Subject: [PATCH 44/45] Apply clang-format --- cpp/src/io/parquet/arrow_schema_writer.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index bc901ded9af..9bc435bf6c8 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -45,10 +45,9 @@ namespace cudf::io::parquet::detail { * * @return The constructed arrow ipc message string */ -std::string construct_arrow_schema_ipc_message( - cudf::detail::LinkedColVector const& linked_columns, - table_input_metadata const& metadata, - cudf::io::detail::single_write_mode const write_mode, - bool const utc_timestamps); +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& 
linked_columns, + table_input_metadata const& metadata, + cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail From 1ceca42b7f4d16fa6b5550594469873c8abba18d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 22:06:56 +0000 Subject: [PATCH 45/45] Add details to `store_schema` docstring --- python/cudf/cudf/utils/ioutils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5d115c6be5a..76c7f2bfdb8 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -323,8 +323,11 @@ If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. + If ``True``, writes arrow schema to Parquet file footer's key-value + metadata section to faithfully round-trip ``duration`` types with arrow. + This cannot be used with ``int96_timestamps`` enabled as int96 timestamps + are deprecated in arrow. Also, all decimal32 and decimal64 columns will be + converted to decimal128 as arrow only supports decimal128 and decimal256 types. **kwargs Additional parameters will be passed to execution engines other than ``cudf``.