From 7e58a7abb8b29f63005dfc92aba84da2ea4007a6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 22 May 2024 03:15:17 +0000 Subject: [PATCH 01/45] Python bindings + initial artifacts for arrow schema in PQ writer --- cpp/include/cudf/io/parquet.hpp | 56 ++++++++ cpp/include/cudf/types.hpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 124 ++++++++++++++---- cpp/src/io/parquet/writer_impl.hpp | 1 + python/cudf/cudf/_lib/parquet.pyx | 11 +- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 18 ++- python/cudf/cudf/io/parquet.py | 4 + 7 files changed, 185 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index b2f949cdcee..ac04cd4c11f 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -602,6 +602,8 @@ class parquet_writer_options { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = true; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -731,6 +733,13 @@ class parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns Column chunks file paths to be set in the raw output metadata. * @@ -882,6 +891,13 @@ class parquet_writer_options { */ void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. 
+ */ + void enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } + /** * @brief Sets column chunks file path to be set in the raw output metadata. * @@ -1215,6 +1231,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + parquet_writer_options_builder& write_arrow_schema(bool enabled) + { + options._write_arrow_schema = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * @@ -1298,6 +1326,8 @@ class chunked_parquet_writer_options { bool _write_timestamps_as_int96 = false; // Parquet writer can write timestamps as UTC. Defaults to true. bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = true; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -1390,6 +1420,13 @@ class chunked_parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns maximum row group size, in bytes. * @@ -1525,6 +1562,13 @@ class chunked_parquet_writer_options { */ void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. + */ + void enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } + /** * @brief Sets the maximum row group size, in bytes. 
* @@ -1711,6 +1755,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + chunked_parquet_writer_options_builder& write_arrow_schema(bool enabled) + { + options._write_arrow_schema = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index baf07fa3db6..101791cee0b 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,7 +216,11 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - DURATION_DAYS, ///< time interval of days in int32 + TIME_SECONDS, ///< time of day since midnight in seconds in int64 + TIME_MILLISECONDS, ///< time of day since midnight in milliseconds in int64 + TIME_MICROSECONDS, ///< time of day since midnight in microseconds in int64 + TIME_NANOSECONDS, ///< time of day since midnight in nanoseconds in int64 + DURATION_DAYS, ///< time interval of days in int64 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 DURATION_MICROSECONDS, ///< time interval of microseconds in int64 diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1dfced94f5b..c67c02655bb 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -25,8 +25,11 @@ #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" +#include "io/utilities/base64_utilities.hpp" #include "io/utilities/column_utils.cuh" #include 
"io/utilities/config_utils.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" @@ -66,12 +69,29 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +/** + * @brief Construct and return arrow schema ipc message from input parquet schema + * + * Recursively traverses through parquet schema to construct arrow schema tree. + * The resulting schema tree is serialized and stored as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header (metadata) size (padded for 16 byte alignment) and a continuation + * string. The final string is base64 encoded and returned to be stored at the keyvalue + * metadata section of the Parquet file footer. + */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // TODO: dummy return empty string for now + return cudf::io::detail::base64_encode(""); +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, host_span tbl_schema, size_type num_columns, - statistics_freq stats_granularity) + statistics_freq stats_granularity, + bool const write_arrow_schema) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -85,6 +105,9 @@ struct aggregate_writer_metadata { this->column_orders = std::vector(num_columns, default_order); } + // Construct the arrow schema ipc message string. 
+ auto const arrow_schema_ipc_message = construct_arrow_schema_ipc_message(schema); + for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), kv_md[p].end(), @@ -92,6 +115,11 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); + // Append arrow schema to the key_value_metadata + if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { + this->files[p].key_value_metadata.emplace_back( + KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); + } } } @@ -507,52 +535,90 @@ struct leaf_schema_fn { } } - // unsupported outside cudf for parquet 1.0. + /* TODO: This code block should be ``time`` type and not ``duration`` type + // unsupported outside cudf for parquet 1.0. + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } + + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, 
TimeUnit::MICROS}}; + } + + // unsupported outside cudf for parquet 1.0. + template + std::enable_if_t, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } + */ + template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; + col_schema.ts_scale = 24 * 60 * 60; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_MILLISECONDS; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_MICROSECONDS; } - // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + col_schema.type = Type::INT64; + col_schema.arrow_type = cudf::type_id::DURATION_NANOSECONDS; } template @@ -625,7 +691,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. */ -std::vector construct_schema_tree( +std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, @@ -1703,12 +1769,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, + bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { auto vec = table_to_linked_columns(input); auto schema_tree = - construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); + construct_parquet_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1831,7 +1898,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity); + partitions, kv_meta, this_table_schema, num_columns, stats_granularity, write_arrow_schema); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2312,6 +2379,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2342,6 +2410,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2420,6 +2489,7 @@ void writer::impl::write(table_view const& input, std::vector co _int96_timestamps, _utc_timestamps, _write_v2_headers, + _write_arrow_schema, _out_sink, _stream); } catch (...) 
{ // catch any exception type diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 784f78f06d5..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -156,6 +156,7 @@ class writer::impl { bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; + bool const _write_arrow_schema; std::optional> _sorting_columns; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70acb7f917b..6b78ac0199c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -408,6 +408,7 @@ def write_parquet( object force_nullable_schema=False, header_version="1.0", use_dictionary=True, + write_arrow_schema=True, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -507,6 +508,7 @@ def write_parquet( .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) .build() ) if partitions_info is not None: @@ -586,6 +588,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. + write_arrow_schema : bool, default True + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section. 
See Also -------- cudf.io.parquet.write_parquet @@ -604,6 +609,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows cdef size_t max_dictionary_size cdef cudf_io_types.dictionary_policy dict_policy + cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", @@ -612,7 +618,8 @@ cdef class ParquetWriter: int max_page_size_bytes=524288, int max_page_size_rows=20000, int max_dictionary_size=1048576, - bool use_dictionary=True): + bool use_dictionary=True, + bool store_schema=True): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -633,6 +640,7 @@ cdef class ParquetWriter: if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) + self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -751,6 +759,7 @@ cdef class ParquetWriter: .max_page_size_bytes(self.max_page_size_bytes) .max_page_size_rows(self.max_page_size_rows) .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) .build() ) args.set_dictionary_policy(self.dict_policy) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 33a594b432f..cb4ce142543 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -80,6 +80,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_partitions( vector[cudf_io_types.partition_info] partitions @@ -99,12 +100,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + 
- void set_int96_timestamps( - bool enabled - ) except + - void set_utc_timestamps( - bool enabled - ) except + + void enable_int96_timestamps(bool val) except + + void enable_utc_timestamps(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + @@ -147,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& write_arrow_schema( + bool enabled + ) except + parquet_writer_options_builder& utc_timestamps( bool enabled ) except + @@ -190,6 +191,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_metadata( cudf_io_types.table_input_metadata m @@ -215,6 +217,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_page_size_rows(size_type val) except + void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + @staticmethod @@ -245,6 +248,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& utc_timestamps( bool enabled ) except + + chunked_parquet_writer_options_builder& write_arrow_schema( + bool enabled + ) except + chunked_parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a6c67d22af7..fa93cd9fd29 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -69,6 +69,7 @@ def _write_parquet( 
force_nullable_schema=False, header_version="1.0", use_dictionary=True, + write_arrow_schema=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -102,6 +103,7 @@ def _write_parquet( "force_nullable_schema": force_nullable_schema, "header_version": header_version, "use_dictionary": use_dictionary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -906,6 +908,7 @@ def to_parquet( force_nullable_schema=False, header_version="1.0", use_dictionary=True, + store_schema=True, *args, **kwargs, ): @@ -983,6 +986,7 @@ def to_parquet( force_nullable_schema=force_nullable_schema, header_version=header_version, use_dictionary=use_dictionary, + write_arrow_schema=store_schema, ) else: From 7351f91f84dc13a0e9dab8e7eeba96553968aed8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 23 May 2024 06:01:29 +0000 Subject: [PATCH 02/45] Add artifacts to build flatbuffers. --- cpp/src/io/parquet/arrow_schema_writer.hpp | 137 +++++++++++++++++++++ cpp/src/io/parquet/parquet_common.hpp | 5 + cpp/src/io/parquet/reader_impl_helpers.cpp | 5 - cpp/src/io/parquet/reader_impl_helpers.hpp | 3 + cpp/src/io/parquet/writer_impl.cu | 19 +-- 5 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 cpp/src/io/parquet/arrow_schema_writer.hpp diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp new file mode 100644 index 00000000000..340e99c82a0 --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema.hpp + * @brief Arrow IPC schema writer implementation + */ + +#pragma once + +#include "io/parquet/parquet.hpp" +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +namespace flatbuf = cudf::io::parquet::flatbuf; + +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + +class FieldPosition { + public: + FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + + FieldPosition child(int index) const { return {this, index}; } + + std::vector path() const + { + std::vector path(_depth); + const FieldPosition* cur = this; + for (int i = _depth - 1; i >= 0; --i) { + path[i] = cur->_index; + cur = cur->parent_; + } + return path; + } + + protected: + FieldPosition(const FieldPosition* parent, int index) + : parent_(parent), _index(index), _depth(parent->_depth + 1) + { + } + + const FieldPosition* parent_; + int _index; + int _depth; +}; + +std::vector make_field_offsets(host_span parquet_schema) +{ + // MH: Get here + std::vector field_offsets; + FieldPosition pos; + + for (size_type i = 0; i < static_cast(parquet_schema.size()); ++i) { + FieldOffset offset; + // FieldToFlatbufferVisitor 
field_visitor(fbb, mapper, pos.child(i)); + // field_visitor.GetResult(schema.field(i), &offset); + field_offsets.push_back(offset); + } + return field_offsets; +} + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. + */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + FlatBufferBuilder fbb; + auto fb_offsets = fbb.CreateVector(make_field_offsets(parquet_schema)); + + flatbuffers::Offset const fb_schema = + flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); + + auto const ipc_message_flatbuffer = flatbuf::CreateMessage(fbb, + flatbuf::MetadataVersion_V5, + flatbuf::MessageHeader_Schema, + fb_schema.Union(), + 0 /* body_length */); + fbb.Finish(ipc_message_flatbuffer); + + int32_t metadata_len = fbb.GetSize(); + + // Store the final string here to pass its view to base64_encode + std::string ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + convert_int32_to_byte_string(metadata_len) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + + // encode the final ipc message to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp 
b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..be469dd25c5 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -26,6 +26,11 @@ auto constexpr MAX_DECIMAL32_PRECISION = 9; auto constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) +// Constants copied from arrow source and renamed to match the case +constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +constexpr int32_t IPC_CONTINUATION_TOKEN = -1; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index eb653c6b9ac..565dc2e02f2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -786,11 +786,6 @@ void aggregate_reader_metadata::apply_arrow_schema() std::optional aggregate_reader_metadata::decode_ipc_message( std::string_view const serialized_message) const { - // Constants copied from arrow source and renamed to match the case - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); - constexpr int32_t IPC_CONTINUATION_TOKEN = -1; - // message buffer auto message_buf = serialized_message.data(); // current message (buffer) size diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9aeb19a7723..8b0f59ef33d 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -117,6 +117,9 @@ struct metadata : public FileMetaData { void sanitize_schema(); }; +/** + * @brief Class to extract data types from arrow schema tree + */ struct arrow_schema_data_types { std::vector 
children; data_type type{type_id::EMPTY}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c67c02655bb..607b1f8b0f5 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" @@ -69,22 +70,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -/** - * @brief Construct and return arrow schema ipc message from input parquet schema - * - * Recursively traverses through parquet schema to construct arrow schema tree. - * The resulting schema tree is serialized and stored as the header (or metadata) of - * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended - * with header (metadata) size (padded for 16 byte alignment) and a continuation - * string. The final string is base64 encoded and returned to be stored at the keyvalue - * metadata section of the Parquet file footer. 
- */ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) -{ - // TODO: dummy return empty string for now - return cudf::io::detail::base64_encode(""); -} - struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -115,7 +100,7 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); - // Append arrow schema to the key_value_metadata + // Append arrow schema to the key-value metadata if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); From 9aca785066c4f079d052a13559b610369ff4c4d8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 23 May 2024 07:19:25 +0000 Subject: [PATCH 03/45] Add basic artifacts to construct the field vector. --- cpp/src/io/parquet/arrow_schema_writer.hpp | 45 +++++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 340e99c82a0..24ebbc61812 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -76,18 +77,41 @@ class FieldPosition { int _depth; }; -std::vector make_field_offsets(host_span parquet_schema) +struct dispatch_to_flatbuf_type {}; + +std::vector make_field_offsets(FlatBufferBuilder& fbb, + host_span parquet_schema) { // MH: Get here std::vector field_offsets; - FieldPosition pos; + [[maybe_unused]] FieldPosition pos; + + // Create flatbuffer Fields and insert in field offsets vector + std::transform(parquet_schema.begin(), + parquet_schema.end(), + std::back_inserter(field_offsets), + [&](auto schema_elem) { + auto fb_name = fbb.CreateString(schema_elem.name); + auto is_nullable = + schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or + 
schema_elem.repetition_type == FieldRepetitionType::REPEATED; + + auto type_type = flatbuf::Type_NONE; + Offset type_offset; + + DictionaryOffset dictionary = 0; + + std::vector children{}; + auto fb_children = fbb.CreateVector(children.data(), children.size()); + // cudf::type_dispatcher( + // schema_elem.type, dispatch_to_flatbuf_type{}, schema_elem, type_offset, + // children); + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); + }); - for (size_type i = 0; i < static_cast(parquet_schema.size()); ++i) { - FieldOffset offset; - // FieldToFlatbufferVisitor field_visitor(fbb, mapper, pos.child(i)); - // field_visitor.GetResult(schema.field(i), &offset); - field_offsets.push_back(offset); - } return field_offsets; } @@ -110,7 +134,8 @@ std::string construct_arrow_schema_ipc_message(host_span pa }; FlatBufferBuilder fbb; - auto fb_offsets = fbb.CreateVector(make_field_offsets(parquet_schema)); + auto field_offsets = make_field_offsets(fbb, parquet_schema); + auto fb_offsets = fbb.CreateVector(field_offsets); flatbuffers::Offset const fb_schema = flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); @@ -119,7 +144,7 @@ std::string construct_arrow_schema_ipc_message(host_span pa flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), - 0 /* body_length */); + 0 /* body_length = 0 */); fbb.Finish(ipc_message_flatbuffer); int32_t metadata_len = fbb.GetSize(); From de0fc403520edd7a6713f7a0b2dbe6a60fb202dc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 24 May 2024 17:39:35 +0000 Subject: [PATCH 04/45] Add artifacts for arrow schema in pq writer --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/arrow_schema_writer.cpp | 159 +++++++++++++++++++++ cpp/src/io/parquet/arrow_schema_writer.hpp | 103 +------------ 3 files changed, 163 insertions(+), 100 deletions(-) create mode 100644 cpp/src/io/parquet/arrow_schema_writer.cpp 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7390c465ccb..dd3ffe3bc12 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -407,6 +407,7 @@ add_library( src/io/orc/stripe_init.cu src/datetime/timezone.cpp src/io/orc/writer_impl.cu + src/io/parquet/arrow_schema_writer.cpp src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp new file mode 100644 index 00000000000..55e1a081c15 --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file arrow_schema.cpp + * @brief Arrow IPC schema writer implementation + */ + +#include "arrow_schema_writer.hpp" + +namespace cudf::io::parquet::detail { + +// Helper class copied over from Arrow source +class FieldPosition { + public: + FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + + FieldPosition child(int index) const { return {this, index}; } + + std::vector path() const + { + std::vector path(_depth); + const FieldPosition* cur = this; + for (int i = _depth - 1; i >= 0; --i) { + path[i] = cur->_index; + cur = cur->parent_; + } + return path; + } + + protected: + FieldPosition(const FieldPosition* parent, int index) + : parent_(parent), _index(index), _depth(parent->_depth + 1) + { + } + + const FieldPosition* parent_; + int _index; + int _depth; +}; + +// Functor for cudf to flatbuf::type conversion +struct dispatch_to_flatbuf_type {}; + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. 
+ */ +std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +{ + // intantiate a flatbuffer builder + FlatBufferBuilder fbb; + + // Lambda function to construct a tree of arrow schema fields + std::function make_arrow_schema_fields = + [&](FieldPosition pos, int32_t const schema_idx) -> FieldOffset { + SchemaElement const schema_elem = parquet_schema[schema_idx]; + + std::vector children{}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(schema_elem.num_children), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(pos.child(idx), schema_elem.children_idx[idx]); + }); + + auto type_type = flatbuf::Type_NONE; + Offset type_offset; + + // TODO: Implement functor + /*cudf::type_dispatcher(schema_elem.arrow_type.value_or(type_id::EMPTY), + dispatch_to_flatbuf_type{}, + schema_elem, + type_offset, + type_type, + children);*/ + + auto const fb_name = fbb.CreateString(schema_elem.name); + auto const fb_children = fbb.CreateVector(children.data(), children.size()); + auto const is_nullable = schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or + schema_elem.repetition_type == FieldRepetitionType::REPEATED; + DictionaryOffset dictionary = 0; + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); + }; + + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + // TODO: What to do with this? 
+ [[maybe_unused]] FieldPosition pos; + std::vector field_offsets; + + // populate field offsets (aka schema fields) + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(parquet_schema[0].num_children), + std::back_inserter(field_offsets), + [&](auto const idx) { + return make_arrow_schema_fields(pos.child(idx), + parquet_schema[0].children_idx[idx]); + }); + + // Create a flatbuffer vector from the field offset vector + auto const fb_offsets = fbb.CreateVector(field_offsets); + + // Create an arrow:schema flatbuffer + flatbuffers::Offset const fb_schema = + flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); + + // Schema type message has zero length body + constexpr int64_t bodylength = 0; + + // Create an ipc message flatbuffer + auto const ipc_message_flatbuffer = flatbuf::CreateMessage( + fbb, flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), bodylength); + + // All done, finish building flatbuffers + fbb.Finish(ipc_message_flatbuffer); + + // Since the ipc message doesn't have a body or other custom key value metadata, + // its size is equal to the size of its header (the schema flatbuffer) + int32_t const metadata_len = fbb.GetSize(); + + // Construct the final string and store in this variable here to use in base64_encode + std::string const ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + convert_int32_to_byte_string(metadata_len) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + + // Encode the final ipc message string to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 24ebbc61812..c9217b8d376 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -31,6 +31,8 @@ #include #include +#include + 
#include #include #include @@ -49,72 +51,6 @@ using FieldOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; -class FieldPosition { - public: - FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} - - FieldPosition child(int index) const { return {this, index}; } - - std::vector path() const - { - std::vector path(_depth); - const FieldPosition* cur = this; - for (int i = _depth - 1; i >= 0; --i) { - path[i] = cur->_index; - cur = cur->parent_; - } - return path; - } - - protected: - FieldPosition(const FieldPosition* parent, int index) - : parent_(parent), _index(index), _depth(parent->_depth + 1) - { - } - - const FieldPosition* parent_; - int _index; - int _depth; -}; - -struct dispatch_to_flatbuf_type {}; - -std::vector make_field_offsets(FlatBufferBuilder& fbb, - host_span parquet_schema) -{ - // MH: Get here - std::vector field_offsets; - [[maybe_unused]] FieldPosition pos; - - // Create flatbuffer Fields and insert in field offsets vector - std::transform(parquet_schema.begin(), - parquet_schema.end(), - std::back_inserter(field_offsets), - [&](auto schema_elem) { - auto fb_name = fbb.CreateString(schema_elem.name); - auto is_nullable = - schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or - schema_elem.repetition_type == FieldRepetitionType::REPEATED; - - auto type_type = flatbuf::Type_NONE; - Offset type_offset; - - DictionaryOffset dictionary = 0; - - std::vector children{}; - auto fb_children = fbb.CreateVector(children.data(), children.size()); - // cudf::type_dispatcher( - // schema_elem.type, dispatch_to_flatbuf_type{}, schema_elem, type_offset, - // children); - - // push to field offsets vector - return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); - }); - - return field_offsets; -} - /** * @brief Construct and return arrow schema from input parquet schema * @@ -124,39 +60,6 @@ std::vector 
make_field_offsets(FlatBufferBuilder& fbb, * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. */ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) -{ - // Lambda function to convert int32 to a string of uint8 bytes - auto const convert_int32_to_byte_string = [&](int32_t const value) { - std::array buffer; - std::memcpy(buffer.data(), &value, sizeof(int32_t)); - return std::string(reinterpret_cast(buffer.data()), buffer.size()); - }; - - FlatBufferBuilder fbb; - auto field_offsets = make_field_offsets(fbb, parquet_schema); - auto fb_offsets = fbb.CreateVector(field_offsets); - - flatbuffers::Offset const fb_schema = - flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); - - auto const ipc_message_flatbuffer = flatbuf::CreateMessage(fbb, - flatbuf::MetadataVersion_V5, - flatbuf::MessageHeader_Schema, - fb_schema.Union(), - 0 /* body_length = 0 */); - fbb.Finish(ipc_message_flatbuffer); - - int32_t metadata_len = fbb.GetSize(); - - // Store the final string here to pass its view to base64_encode - std::string ipc_message = - convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + - convert_int32_to_byte_string(metadata_len) + - std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); - - // encode the final ipc message to base64 and return - return cudf::io::detail::base64_encode(ipc_message); -} +std::string construct_arrow_schema_ipc_message(host_span parquet_schema); } // namespace cudf::io::parquet::detail From 497727ebce22a3fae86b382ad1a1ad8b5c275479 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 24 May 2024 17:44:27 +0000 Subject: [PATCH 05/45] merge with upstream --- python/cudf/cudf/io/parquet.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ac8b79424b3..25647d16271 100644 --- a/python/cudf/cudf/io/parquet.py +++ 
b/python/cudf/cudf/io/parquet.py @@ -107,6 +107,11 @@ def _write_parquet( "force_nullable_schema": force_nullable_schema, "header_version": header_version, "use_dictionary": use_dictionary, + "skip_compression": skip_compression, + "column_encoding": column_encoding, + "column_type_length": column_type_length, + "output_as_binary": output_as_binary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -953,6 +958,11 @@ def to_parquet( force_nullable_schema=False, header_version="1.0", use_dictionary=True, + skip_compression=None, + column_encoding=None, + column_type_length=None, + output_as_binary=None, + store_schema=True, *args, **kwargs, ): @@ -1036,6 +1046,11 @@ def to_parquet( force_nullable_schema=force_nullable_schema, header_version=header_version, use_dictionary=use_dictionary, + skip_compression=skip_compression, + column_encoding=column_encoding, + column_type_length=column_type_length, + output_as_binary=output_as_binary, + write_arrow_schema=store_schema, ) else: From d166fe6f367c36db86e460c095267296df66ac90 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 25 May 2024 02:23:49 +0000 Subject: [PATCH 06/45] Workin arrow schema builder. 
Need to handle nested_types and dict32 --- cpp/src/io/parquet/arrow_schema_writer.cpp | 352 +++++++++++++++++---- cpp/src/io/parquet/arrow_schema_writer.hpp | 13 +- cpp/src/io/parquet/writer_impl.cu | 15 +- 3 files changed, 317 insertions(+), 63 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 55e1a081c15..9f55a61f630 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -23,10 +23,39 @@ namespace cudf::io::parquet::detail { -// Helper class copied over from Arrow source +class FieldPosition; + +/** + * @brief Function to construct a tree of arrow schema fields + */ +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + FieldPosition field_position, + cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode const write_mode, + bool const utc_timestamps); + +// TODO: Copied over from ``writer_impl.cu``. Need to placed at a common location to avoid +// duplication. +inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode write_mode) +{ + if (col_meta.is_nullability_defined()) { + CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return col_meta.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or col->nullable(); +} + +// Helper class copied over from Arrow source. Do we need it even? 
class FieldPosition { public: - FieldPosition() : parent_(nullptr), _index(-1), _depth(0) {} + FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} FieldPosition child(int index) const { return {this, index}; } @@ -36,24 +65,271 @@ class FieldPosition { const FieldPosition* cur = this; for (int i = _depth - 1; i >= 0; --i) { path[i] = cur->_index; - cur = cur->parent_; + cur = cur->_parent; } return path; } protected: FieldPosition(const FieldPosition* parent, int index) - : parent_(parent), _index(index), _depth(parent->_depth + 1) + : _parent(parent), _index(index), _depth(parent->_depth + 1) { } - const FieldPosition* parent_; + const FieldPosition* _parent; int _index; int _depth; }; -// Functor for cudf to flatbuf::type conversion -struct dispatch_to_flatbuf_type {}; +/** + * @brief Functor to convert cudf column metadata to arrow schema + */ +struct dispatch_to_flatbuf { + FlatBufferBuilder& fbb; + cudf::detail::LinkedColPtr const& col; + column_in_metadata const& col_meta; + single_write_mode const write_mode; + bool const utc_timestamps; + FieldPosition& field_position; + Offset& field_offset; + flatbuf::Type& type_type; + std::vector& children; + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, true).Union(); + } + + template + std::enable_if_t, void> operator()() + { + 
type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, false).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); + } + + template + std::enable_if_t or std::is_same_v, + void> + operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Timestamp; + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t or std::is_same_v, void> + operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + } + + template + std::enable_if_t(), void> operator()() + { + if (std::is_same_v) { + type_type = flatbuf::Type_Decimal; + field_offset = + flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) + .Union(); + } else { + CUDF_FAIL("fixed point type other than decimal128 not supported for arrow schema"); + } + } + + template + std::enable_if_t(), void> operator()() + { + // TODO: Handle list and struct types. Remember, Lists are different in arrow schema and PQ + // schema pq schema. List in PQ schema: "column_name" : { "list" : { "element" }} in + // List in arrow schema: "column_name" : { "list" } + // TODO: Arrow expects only 1 child for Lists and Structs. How and Why? 
+ std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(fbb, + field_position.child(idx), + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps); + }); + + if (std::is_same_v) { + type_type = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); + } else if (std::is_same_v) { + type_type = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); + } else { + CUDF_FAIL("Unexpected nested type"); + } + } + + template + std::enable_if_t(), void> operator()() + { + CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + FieldPosition field_position, + cudf::detail::LinkedColPtr const& col, + column_in_metadata const& col_meta, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + Offset field_offset = 0; + flatbuf::Type type_type = flatbuf::Type_NONE; + std::vector children; + + cudf::type_dispatcher(col->type(), + dispatch_to_flatbuf{fbb, + col, + col_meta, + write_mode, + utc_timestamps, + field_position, + field_offset, + type_type, + children}); + + auto const fb_name = fbb.CreateString(col_meta.get_name()); + auto const fb_children = fbb.CreateVector(children.data(), children.size()); + auto const is_nullable = is_col_nullable(col, col_meta, write_mode); + DictionaryOffset dictionary = 0; + + // push to field offsets vector + return flatbuf::CreateField( + fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); +} /** * @brief Construct and return arrow schema from input parquet schema @@ -64,47 +340,11 @@ struct dispatch_to_flatbuf_type {}; * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. 
*/ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema) +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode const write_mode, + bool const utc_timestamps) { - // intantiate a flatbuffer builder - FlatBufferBuilder fbb; - - // Lambda function to construct a tree of arrow schema fields - std::function make_arrow_schema_fields = - [&](FieldPosition pos, int32_t const schema_idx) -> FieldOffset { - SchemaElement const schema_elem = parquet_schema[schema_idx]; - - std::vector children{}; - - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(schema_elem.num_children), - std::back_inserter(children), - [&](auto const idx) { - return make_arrow_schema_fields(pos.child(idx), schema_elem.children_idx[idx]); - }); - - auto type_type = flatbuf::Type_NONE; - Offset type_offset; - - // TODO: Implement functor - /*cudf::type_dispatcher(schema_elem.arrow_type.value_or(type_id::EMPTY), - dispatch_to_flatbuf_type{}, - schema_elem, - type_offset, - type_type, - children);*/ - - auto const fb_name = fbb.CreateString(schema_elem.name); - auto const fb_children = fbb.CreateVector(children.data(), children.size()); - auto const is_nullable = schema_elem.repetition_type == FieldRepetitionType::OPTIONAL or - schema_elem.repetition_type == FieldRepetitionType::REPEATED; - DictionaryOffset dictionary = 0; - - // push to field offsets vector - return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, type_offset, dictionary, fb_children); - }; - // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { std::array buffer; @@ -112,17 +352,23 @@ std::string construct_arrow_schema_ipc_message(host_span pa return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // TODO: What to do with this? 
- [[maybe_unused]] FieldPosition pos; + // intantiate a flatbuffer builder + FlatBufferBuilder fbb; + + FieldPosition field_position; std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(parquet_schema[0].num_children), + std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(linked_columns.size()), std::back_inserter(field_offsets), [&](auto const idx) { - return make_arrow_schema_fields(pos.child(idx), - parquet_schema[0].children_idx[idx]); + return make_arrow_schema_fields(fbb, + field_position.child(idx), + linked_columns[idx], + metadata.column_metadata[idx], + write_mode, + utc_timestamps); }); // Create a flatbuffer vector from the field offset vector diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index c9217b8d376..bc5ddbaa27e 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -21,14 +21,18 @@ #pragma once -#include "io/parquet/parquet.hpp" #include "io/parquet/parquet_common.hpp" #include "io/utilities/base64_utilities.hpp" #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include +#include +#include +#include +#include #include -#include +#include #include #include @@ -60,6 +64,9 @@ using FBString = flatbuffers::Offset; * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. 
*/ -std::string construct_arrow_schema_ipc_message(host_span parquet_schema); +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode write_mode, + bool utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 607b1f8b0f5..60b1970e979 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -26,7 +26,6 @@ #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" -#include "io/utilities/base64_utilities.hpp" #include "io/utilities/column_utils.cuh" #include "io/utilities/config_utils.hpp" #include "ipc/Message_generated.h" @@ -76,7 +75,7 @@ struct aggregate_writer_metadata { host_span tbl_schema, size_type num_columns, statistics_freq stats_granularity, - bool const write_arrow_schema) + std::string const arrow_schema_ipc_message) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -90,9 +89,6 @@ struct aggregate_writer_metadata { this->column_orders = std::vector(num_columns, default_order); } - // Construct the arrow schema ipc message string. 
- auto const arrow_schema_ipc_message = construct_arrow_schema_ipc_message(schema); - for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), kv_md[p].end(), @@ -101,7 +97,7 @@ struct aggregate_writer_metadata { return KeyValue{kv.first, kv.second}; }); // Append arrow schema to the key-value metadata - if (write_arrow_schema and not arrow_schema_ipc_message.empty()) { + if (not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); } @@ -1883,7 +1879,12 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity, write_arrow_schema); + partitions, + kv_meta, + this_table_schema, + num_columns, + stats_granularity, + construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps)); } else { agg_meta = std::make_unique(*curr_agg_meta); From 7dad37bed8763788e019d59e6dda04ea2f5ebb64 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 00:34:05 +0000 Subject: [PATCH 07/45] Handle structs and lists --- cpp/src/io/parquet/arrow_schema_writer.cpp | 103 ++++++------ cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/parquet_common.hpp | 3 + python/cudf/cudf/tests/test_parquet.py | 174 ++++++++++++++++++--- 4 files changed, 213 insertions(+), 71 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 9f55a61f630..1452bbea8f9 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -52,7 +52,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, return write_mode == single_write_mode::NO or col->nullable(); } -// Helper class copied over from Arrow source. Do we need it even? 
+// TODO: Helper class copied over from Arrow source. Do we need it even? class FieldPosition { public: FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} @@ -184,7 +184,8 @@ struct dispatch_to_flatbuf { void> operator()() { - type_type = flatbuf::Type_Timestamp; + type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -194,6 +195,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -204,6 +206,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -214,6 +217,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -252,43 +256,52 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { + // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. 
+ // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 + // if (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); } else { - CUDF_FAIL("fixed point type other than decimal128 not supported for arrow schema"); + // TODO: Should we fail or just not write arrow:schema anymore? + CUDF_FAIL("Fixed point types other than decimal128 are not supported for arrow schema"); } } template std::enable_if_t(), void> operator()() { - // TODO: Handle list and struct types. Remember, Lists are different in arrow schema and PQ - // schema pq schema. List in PQ schema: "column_name" : { "list" : { "element" }} in - // List in arrow schema: "column_name" : { "list" } - // TODO: Arrow expects only 1 child for Lists and Structs. How and Why? - std::transform(thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(col->children.size()), - std::back_inserter(children), - [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - col->children[idx], - col_meta.child(idx), + // Lists are represented differently in arrow and cuDF. + // cuDF representation: List: "col_name" : { "list" : { "element" }} (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + if constexpr (std::is_same_v) { + // Only need to process the second child (at idx = 1) + children.emplace_back(make_arrow_schema_fields(fbb, + field_position.child(0), + col->children[1], + col_meta.child(1), write_mode, - utc_timestamps); - }); - - if (std::is_same_v) { + utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); - } else if (std::is_same_v) { + } + // Traverse the struct in DFS manner and process children fields. 
+ else if constexpr (std::is_same_v) { + std::transform(thrust::make_counting_iterator(0UL), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields(fbb, + field_position.child(idx), + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps); + }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); - } else { - CUDF_FAIL("Unexpected nested type"); } } @@ -352,10 +365,13 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // intantiate a flatbuffer builder + // Intantiate a flatbuffer builder FlatBufferBuilder fbb; + // Instantiate a field position mapper struct (not sure if needed yet?) FieldPosition field_position; + + // Create an empty field offset vector std::vector field_offsets; // populate field offsets (aka schema fields) @@ -371,32 +387,25 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con utc_timestamps); }); - // Create a flatbuffer vector from the field offset vector - auto const fb_offsets = fbb.CreateVector(field_offsets); - - // Create an arrow:schema flatbuffer - flatbuffers::Offset const fb_schema = - flatbuf::CreateSchema(fbb, flatbuf::Endianness::Endianness_Little, fb_offsets); - - // Schema type message has zero length body - constexpr int64_t bodylength = 0; - - // Create an ipc message flatbuffer - auto const ipc_message_flatbuffer = flatbuf::CreateMessage( - fbb, flatbuf::MetadataVersion_V5, flatbuf::MessageHeader_Schema, fb_schema.Union(), bodylength); - - // All done, finish building flatbuffers - fbb.Finish(ipc_message_flatbuffer); - - // Since the ipc message doesn't have a body or other custom key value metadata, - // its size is equal to the size of its header (the schema flatbuffer) - int32_t const metadata_len = fbb.GetSize(); - - // Construct the final 
string and store in this variable here to use in base64_encode + // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to + // create an ipc message flatbuffer + fbb.Finish(flatbuf::CreateMessage( + fbb, + flatbuf::MetadataVersion_V5, /* Metadata version V5 (latest) */ + flatbuf::MessageHeader_Schema, /* Schema type message header */ + flatbuf::CreateSchema( + fbb, flatbuf::Endianness::Endianness_Little, fbb.CreateVector(field_offsets)) + .Union(), /* Build an arrow:schema from the field vector */ + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH /* Body length is zero for schema type ipc message */ + )); + + // Construct the final string and store it here to use its view in base64_encode std::string const ipc_message = convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + - convert_int32_to_byte_string(metadata_len) + - std::string(reinterpret_cast(fbb.GetBufferPointer()), metadata_len); + // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the + // ipc message's metadata length + convert_int32_to_byte_string(fbb.GetSize()) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), fbb.GetSize()); // Encode the final ipc message string to base64 and return return cudf::io::detail::base64_encode(ipc_message); diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index bc5ddbaa27e..29db9f05df4 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -66,7 +66,7 @@ using FBString = flatbuffers::Offset; */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, - single_write_mode write_mode, - bool utc_timestamps); + single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 
be469dd25c5..69c0a89fd57 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -31,6 +31,9 @@ constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(in constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); constexpr int32_t IPC_CONTINUATION_TOKEN = -1; +// Schema type ipc message has zero length body +constexpr int64_t SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e32fdacd8d6..0776a3a6ada 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3372,31 +3372,28 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Ensure that the structs with duration types are faithfully being # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame( - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], + data = { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), } - } + ], } - ) + } + + pdf = pd.DataFrame({"struct": pd.Series(data)}) - # Reset the buffer and write parquet with arrow buffer = BytesIO() pdf.to_parquet(buffer, engine="pyarrow") @@ -3407,3 +3404,136 @@ def 
test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + + +def test_parquet_writer_roundtrip_with_arrow_schema(): + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "ms": cudf.Series([1234, None, 32442], dtype="timedelta64[ms]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "ns": cudf.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int64_list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + buffer = BytesIO() + expected.to_parquet(buffer) + read = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + + assert_eq(expected, read) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, 
"c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # IO buffer + buffer = BytesIO() + + # Write expected data frame to Parquet + expected.to_parquet(buffer) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + + From e1fc02ef62dcdd054c5fefb08e112304b2b2ec5f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 01:16:25 +0000 Subject: [PATCH 08/45] Remove unused code borrowed from arrow. 
--- cpp/src/io/parquet/arrow_schema_writer.cpp | 94 +++++----------------- cpp/src/io/parquet/writer_impl.cu | 50 ------------ python/cudf/cudf/tests/test_parquet.py | 2 - 3 files changed, 21 insertions(+), 125 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 1452bbea8f9..3bc728abe10 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -23,13 +23,10 @@ namespace cudf::io::parquet::detail { -class FieldPosition; - /** * @brief Function to construct a tree of arrow schema fields */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - FieldPosition field_position, cudf::detail::LinkedColPtr const& col, column_in_metadata const& col_meta, single_write_mode const write_mode, @@ -52,35 +49,6 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, return write_mode == single_write_mode::NO or col->nullable(); } -// TODO: Helper class copied over from Arrow source. Do we need it even? 
-class FieldPosition { - public: - FieldPosition() : _parent(nullptr), _index(-1), _depth(0) {} - - FieldPosition child(int index) const { return {this, index}; } - - std::vector path() const - { - std::vector path(_depth); - const FieldPosition* cur = this; - for (int i = _depth - 1; i >= 0; --i) { - path[i] = cur->_index; - cur = cur->_parent; - } - return path; - } - - protected: - FieldPosition(const FieldPosition* parent, int index) - : _parent(parent), _index(index), _depth(parent->_depth + 1) - { - } - - const FieldPosition* _parent; - int _index; - int _depth; -}; - /** * @brief Functor to convert cudf column metadata to arrow schema */ @@ -90,7 +58,6 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; - FieldPosition& field_position; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -274,16 +241,12 @@ struct dispatch_to_flatbuf { std::enable_if_t(), void> operator()() { // Lists are represented differently in arrow and cuDF. 
- // cuDF representation: List: "col_name" : { "list" : { "element" }} (2 children) - // arrow schema representation: List: "col_name" : { "list" } (1 child) + // cuDF representation: List: "col_name" : { "list","element : int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) if constexpr (std::is_same_v) { // Only need to process the second child (at idx = 1) - children.emplace_back(make_arrow_schema_fields(fbb, - field_position.child(0), - col->children[1], - col_meta.child(1), - write_mode, - utc_timestamps)); + children.emplace_back(make_arrow_schema_fields( + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -293,12 +256,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - col->children[idx], - col_meta.child(idx), - write_mode, - utc_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -308,12 +267,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { + // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and + // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. 
CUDF_FAIL("Dictionary columns are not supported for writing"); } }; FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - FieldPosition field_position, cudf::detail::LinkedColPtr const& col, column_in_metadata const& col_meta, single_write_mode const write_mode, @@ -323,16 +283,10 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(col->type(), - dispatch_to_flatbuf{fbb, - col, - col_meta, - write_mode, - utc_timestamps, - field_position, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + col->type(), + dispatch_to_flatbuf{ + fbb, col, col_meta, write_mode, utc_timestamps, field_offset, type_type, children}); auto const fb_name = fbb.CreateString(col_meta.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -365,27 +319,21 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; - // Intantiate a flatbuffer builder + // Instantiate a flatbuffer builder FlatBufferBuilder fbb; - // Instantiate a field position mapper struct (not sure if needed yet?) 
- FieldPosition field_position; - // Create an empty field offset vector std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform(thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(linked_columns.size()), - std::back_inserter(field_offsets), - [&](auto const idx) { - return make_arrow_schema_fields(fbb, - field_position.child(idx), - linked_columns[idx], - metadata.column_metadata[idx], - write_mode, - utc_timestamps); - }); + std::transform( + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(linked_columns.size()), + std::back_inserter(field_offsets), + [&](auto const idx) { + return make_arrow_schema_fields( + fbb, linked_columns[idx], metadata.column_metadata[idx], write_mode, utc_timestamps); + }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to // create an ipc message flatbuffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 60b1970e979..81fb42bee15 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -516,56 +516,6 @@ struct leaf_schema_fn { } } - /* TODO: This code block should be ``time`` type and not ``duration`` type - // unsupported outside cudf for parquet 1.0. 
- template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - } - - template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; - } - - // unsupported outside cudf for parquet 1.0. 
- template - std::enable_if_t, void> operator()() - { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; - } - */ - template std::enable_if_t, void> operator()() { diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 0776a3a6ada..b183c5ef6e5 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3535,5 +3535,3 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Check results assert_eq(expected, got) assert_eq(expected, got2) - - From 44fb0ef7d6e1ef3098ad6cce527a89c966894511 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 02:37:09 +0000 Subject: [PATCH 09/45] minor improvements to tests and code --- cpp/src/io/parquet/arrow_schema_writer.cpp | 24 +++++++++---------- cpp/src/io/parquet/writer_impl.cu | 4 +--- python/cudf/cudf/tests/test_parquet.py | 27 ++++++++++++++-------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 3bc728abe10..90f32a2b3cb 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -73,56 +73,56 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, true).Union(); + field_offset = 
flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, true).Union(); + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, false).Union(); + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template @@ -226,14 +226,14 @@ struct dispatch_to_flatbuf { // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 // - if (std::is_same_v) { + if constexpr (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); } else { // TODO: Should we fail or just not write arrow:schema anymore? 
- CUDF_FAIL("Fixed point types other than decimal128 are not supported for arrow schema"); + CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } } @@ -269,7 +269,7 @@ struct dispatch_to_flatbuf { { // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. - CUDF_FAIL("Dictionary columns are not supported for writing"); + CUDF_FAIL("Dictionary columns are not supported for writing arrow schema"); } }; @@ -327,7 +327,7 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // populate field offsets (aka schema fields) std::transform( - thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(0UL), thrust::make_counting_iterator(linked_columns.size()), std::back_inserter(field_offsets), [&](auto const idx) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fb42bee15..0c7798b9b18 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -28,8 +28,6 @@ #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "io/utilities/config_utils.hpp" -#include "ipc/Message_generated.h" -#include "ipc/Schema_generated.h" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" @@ -99,7 +97,7 @@ struct aggregate_writer_metadata { // Append arrow schema to the key-value metadata if (not arrow_schema_ipc_message.empty()) { this->files[p].key_value_metadata.emplace_back( - KeyValue{"ARROW:schema", std::move(arrow_schema_ipc_message)}); + KeyValue{"ARROW:schema", arrow_schema_ipc_message}); } } } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b183c5ef6e5..62d40cff67c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3407,12 +3407,12 @@ def 
test_parquet_reader_roundtrip_structs_with_arrow_schema(): def test_parquet_writer_roundtrip_with_arrow_schema(): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema expected = cudf.DataFrame( { "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), - "ms": cudf.Series([1234, None, 32442], dtype="timedelta64[ms]"), "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), - "ns": cudf.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), "duration_list": list( [ [ @@ -3429,8 +3429,13 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): ], ] ), - "int64": cudf.Series([1234, 123, 4123], dtype="int64"), - "int64_list": list([[1, 2], [1, 2], [1, 2]]), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed_pt": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "map": cudf.Series(["cat", "dog", "lion"]).map( {"cat": "kitten", "dog": "puppy", "lion": "cub"} @@ -3438,11 +3443,17 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) + # Write to Parquet buffer = BytesIO() expected.to_parquet(buffer) - read = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - assert_eq(expected, read) + # Read parquet with pyarrow and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) @pytest.mark.parametrize( @@ -3516,10 +3527,8 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): expected = cudf.DataFrame.from_arrow(pa_expected) - # IO buffer - buffer = BytesIO() - # Write expected data frame to Parquet + buffer = BytesIO() expected.to_parquet(buffer) # Read Parquet with pyarrow From 
e733ff150cf9e3d3af4c37598b3f7a955037eac8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:21:59 +0000 Subject: [PATCH 10/45] Code cleanup and add API docs. --- cpp/src/io/parquet/arrow_schema_writer.cpp | 104 ++++++++++----------- cpp/src/io/parquet/arrow_schema_writer.hpp | 53 ++++++----- cpp/src/io/parquet/writer_impl.cu | 15 --- 3 files changed, 84 insertions(+), 88 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 90f32a2b3cb..25d3e01abc4 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -15,40 +15,46 @@ */ /** - * @file arrow_schema.cpp + * @file arrow_schema_writer.cpp * @brief Arrow IPC schema writer implementation */ #include "arrow_schema_writer.hpp" +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" + +#include +#include +#include + namespace cudf::io::parquet::detail { +// Copied over from arrow source for better code readability +namespace flatbuf = cudf::io::parquet::flatbuf; +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + /** * @brief Function to construct a tree of arrow schema fields + * + * @param fbb + * @param column + * @param column_metadata + * @param write_mode + * @param utc_timestamps */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, single_write_mode const write_mode, bool const utc_timestamps); -// TODO: Copied over from ``writer_impl.cu``. Need to placed at a common location to avoid -// duplication. 
-inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Functor to convert cudf column metadata to arrow schema */ @@ -152,7 +158,7 @@ struct dispatch_to_flatbuf { operator()() { type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic + // TODO: Verify if this is the correct logic for UTC field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -223,16 +229,17 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - // TODO: cuDF-PQ writer supports d32 and d64 types not supported by Arrow without conversion. - // See more: https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155 - // if constexpr (std::is_same_v) { type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) .Union(); - } else { - // TODO: Should we fail or just not write arrow:schema anymore? + } + // cuDF-PQ writer supports ``decimal32`` and ``decimal64`` types, not directly supported by + // Arrow without explicit conversion. See more: + // https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155. + else { + // TODO: Should we fail here or just not write arrow schema?. 
CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } } @@ -241,15 +248,16 @@ struct dispatch_to_flatbuf { std::enable_if_t(), void> operator()() { // Lists are represented differently in arrow and cuDF. - // cuDF representation: List: "col_name" : { "list","element : int" } (2 children) - // arrow schema representation: List: "col_name" : { "list" } (1 child) + // cuDF representation: List: "col_name" : { "list", "element:int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { - // Only need to process the second child (at idx = 1) children.emplace_back(make_arrow_schema_fields( fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } + // Traverse the struct in DFS manner and process children fields. else if constexpr (std::is_same_v) { std::transform(thrust::make_counting_iterator(0UL), @@ -274,8 +282,8 @@ struct dispatch_to_flatbuf { }; FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, - cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, single_write_mode const write_mode, bool const utc_timestamps) { @@ -284,13 +292,13 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::vector children; cudf::type_dispatcher( - col->type(), + column->type(), dispatch_to_flatbuf{ - fbb, col, col_meta, write_mode, utc_timestamps, field_offset, type_type, children}); + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - auto const fb_name = fbb.CreateString(col_meta.get_name()); + auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); - 
auto const is_nullable = is_col_nullable(col, col_meta, write_mode); + auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); DictionaryOffset dictionary = 0; // push to field offsets vector @@ -298,15 +306,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); } -/** - * @brief Construct and return arrow schema from input parquet schema - * - * Recursively traverses through parquet schema to construct the arrow schema tree. - * Serializes the arrow schema tree and stores it as the header (or metadata) of - * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended - * with header size (padded for 16 byte alignment) and a continuation string. The final - * string is base64 encoded and returned. - */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, @@ -326,14 +325,15 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con std::vector field_offsets; // populate field offsets (aka schema fields) - std::transform( - thrust::make_counting_iterator(0UL), - thrust::make_counting_iterator(linked_columns.size()), - std::back_inserter(field_offsets), - [&](auto const idx) { - return make_arrow_schema_fields( - fbb, linked_columns[idx], metadata.column_metadata[idx], write_mode, utc_timestamps); - }); + std::transform(thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), + std::back_inserter(field_offsets), + [&](auto const& elem) { + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + }); // Build an arrow:schema flatbuffer using the field offset vector and use it as 
the header to // create an ipc message flatbuffer diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 29db9f05df4..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -15,45 +15,49 @@ */ /** - * @file arrow_schema.hpp + * @file arrow_schema_writer.hpp * @brief Arrow IPC schema writer implementation */ #pragma once -#include "io/parquet/parquet_common.hpp" -#include "io/utilities/base64_utilities.hpp" -#include "ipc/Message_generated.h" -#include "ipc/Schema_generated.h" - -#include #include #include #include #include #include -#include -#include - -#include -#include -#include #include -#include #include namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -namespace flatbuf = cudf::io::parquet::flatbuf; - -using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; -using DictionaryOffset = flatbuffers::Offset; -using FieldOffset = flatbuffers::Offset; -using Offset = flatbuffers::Offset; -using FBString = flatbuffers::Offset; +/** + * @brief Returns ``true`` if the column is nullable or if the write mode is not + * set to write the table all at once instead of chunked + * + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] inline bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. 
" + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} /** * @brief Construct and return arrow schema from input parquet schema @@ -63,6 +67,13 @@ using FBString = flatbuffers::Offset; * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended * with header size (padded for 16 byte alignment) and a continuation string. The final * string is base64 encoded and returned. + * + * @param linked_columns Vector of table column views + * @param metadata Metadata of the columns of the table + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0c7798b9b18..63745939755 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -599,21 +599,6 @@ struct leaf_schema_fn { } }; -inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. 
- return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Construct schema from input columns and per-column input options * From f4a9595d393a8362f857e778876ba3b542fcd9e5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:22:52 +0000 Subject: [PATCH 11/45] Revert changes to types.hpp --- cpp/include/cudf/types.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 101791cee0b..466d53fcafc 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,10 +216,6 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - TIME_SECONDS, ///< time of day since midnight in seconds in int64 - TIME_MILLISECONDS, ///< time of day since midnight in milliseconds in int64 - TIME_MICROSECONDS, ///< time of day since midnight in microseconds in int64 - TIME_NANOSECONDS, ///< time of day since midnight in nanoseconds in int64 DURATION_DAYS, ///< time interval of days in int64 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 From ede6191e2a6483c0309439262a514b9f53908c51 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:31:51 +0000 Subject: [PATCH 12/45] Minor code and doc cleanup --- cpp/src/io/parquet/arrow_schema_writer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 25d3e01abc4..1c3f66fc818 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -41,13 +41,15 @@ using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; /** - * @brief 
Function to construct a tree of arrow schema fields + * @brief Recursively construct the arrow schema (fields) tree * - * @param fbb - * @param column - * @param column_metadata - * @param write_mode - * @param utc_timestamps + * @param fbb The root flatbuffer builder object instance + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return Flatbuffer offset to the constructed field */ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, @@ -56,7 +58,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, bool const utc_timestamps); /** - * @brief Functor to convert cudf column metadata to arrow schema + * @brief Functor to convert cudf column metadata to arrow schema field metadata */ struct dispatch_to_flatbuf { FlatBufferBuilder& fbb; From 62a26843be733c53fa4d4a378bd5676013984be0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 03:40:30 +0000 Subject: [PATCH 13/45] Minor fix for failing pytest --- python/cudf/cudf/tests/test_parquet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 62d40cff67c..ae5846f9bcc 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3170,10 +3170,10 @@ def test_parquet_writer_time_delta_physical_type(): got = pd.read_parquet(buffer) expected = pd.DataFrame( { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], }, dtype="str", ) From 6e448abefffa5d82c9b9f72f7f98abecebfd2540 Mon Sep 17 00:00:00 2001 From: Muhammad 
Haseeb Date: Wed, 29 May 2024 04:46:29 +0000 Subject: [PATCH 14/45] Handle int96 timestamps. --- cpp/src/io/parquet/arrow_schema_writer.cpp | 71 +++++++++++++++++----- cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 4 +- python/cudf/cudf/tests/test_parquet.py | 8 ++- 4 files changed, 68 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 1c3f66fc818..b13bcc98581 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -48,6 +48,7 @@ using FBString = flatbuffers::Offset; * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC + * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return Flatbuffer offset to the constructed field */ @@ -55,7 +56,8 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps); + bool const utc_timestamps, + bool const int96_timestamps); /** * @brief Functor to convert cudf column metadata to arrow schema field metadata @@ -66,6 +68,7 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; + bool const int96_timestamps; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -159,8 +162,13 @@ struct dispatch_to_flatbuf { void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", 
"Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) .Union(); @@ -169,8 +177,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -180,8 +193,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) @@ -191,8 +209,13 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // INT96 timestamps have been deprecated in arrow + if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { + CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); + } + type_type = flatbuf::Type_Timestamp; - // TODO: Verify if this is the correct logic for UTC + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) @@ -255,7 +278,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps, int96_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -266,8 +289,12 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields( - fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + col->children[idx], + col_meta.child(idx), + write_mode, + utc_timestamps, + int96_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -287,16 +314,23 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps) + bool const utc_timestamps, + bool const int96_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + int96_timestamps, + field_offset, + type_type, + children}); auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -311,7 +345,8 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string 
construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps) + bool const utc_timestamps, + bool const int96_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { @@ -333,8 +368,12 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields( - fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + thrust::get<0>(elem), + thrust::get<1>(elem), + write_mode, + utc_timestamps, + int96_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..8ba48361eb3 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -72,12 +72,14 @@ using namespace cudf::io::detail; * @param metadata Metadata of the columns of the table * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC + * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps); + bool const utc_timestamps, + bool const int96_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 
63745939755..d08fed7761c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1817,7 +1817,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps)); + (write_arrow_schema) ? construct_arrow_schema_ipc_message( + vec, table_meta, write_mode, utc_timestamps, int96_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ae5846f9bcc..220cef67bd8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1620,7 +1620,13 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True) + # INT96 timestamps have been deprecated in Arrow so set `store_schema=False` + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + store_schema=False, + ) assert os.path.exists(gdf_fname) From 3c800f5e65ff261acc8966d56e53a99357c366f6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 29 May 2024 19:15:44 +0000 Subject: [PATCH 15/45] Add `stats_dtype` to INT64 duration columns --- cpp/src/io/parquet/writer_impl.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d08fed7761c..29ac5125a85 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -517,37 +517,37 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; - col_schema.ts_scale = 24 * 60 * 60; + col_schema.type = Type::INT64; + 
col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.ts_scale = 24 * 60 * 60; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_SECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_MILLISECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_MICROSECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.arrow_type = cudf::type_id::DURATION_NANOSECONDS; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; } template From f7aaaad7476aad0def7d903cbca9e84968219db5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 01:42:15 +0000 Subject: [PATCH 16/45] turn arrow schema off by default --- cpp/include/cudf/io/parquet.hpp | 6 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 67 +++---------- cpp/src/io/parquet/arrow_schema_writer.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 110 ++++++++++----------- cpp/src/io/parquet/writer_impl.hpp | 1 - python/cudf/cudf/_lib/parquet.pyx | 6 +- python/cudf/cudf/tests/test_parquet.py | 40 +------- 7 files changed, 81 insertions(+), 153 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ac04cd4c11f..ede1994312d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -72,7 +72,7 @@ class parquet_reader_options { // Whether to use PANDAS metadata to load columns bool _use_pandas_metadata = true; // Whether to read and use 
ARROW schema - bool _use_arrow_schema = true; + bool _use_arrow_schema = false; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; @@ -603,7 +603,7 @@ class parquet_writer_options { // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; // Whether to write ARROW schema - bool _write_arrow_schema = true; + bool _write_arrow_schema = false; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -1327,7 +1327,7 @@ class chunked_parquet_writer_options { // Parquet writer can write timestamps as UTC. Defaults to true. bool _write_timestamps_as_UTC = true; // Whether to write ARROW schema - bool _write_arrow_schema = true; + bool _write_arrow_schema = false; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index b13bcc98581..f3f3fba4bac 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -48,7 +48,6 @@ using FBString = flatbuffers::Offset; * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC - * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return Flatbuffer offset to the constructed field */ @@ -56,8 +55,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps); + bool const 
utc_timestamps); /** * @brief Functor to convert cudf column metadata to arrow schema field metadata @@ -68,7 +66,6 @@ struct dispatch_to_flatbuf { column_in_metadata const& col_meta; single_write_mode const write_mode; bool const utc_timestamps; - bool const int96_timestamps; Offset& field_offset; flatbuf::Type& type_type; std::vector& children; @@ -162,11 +159,6 @@ struct dispatch_to_flatbuf { void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -177,11 +169,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -193,11 +180,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -209,11 +191,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // INT96 timestamps have been deprecated in arrow - if (int96_timestamps or col_meta.is_enabled_int96_timestamps()) { - CUDF_FAIL("INT96 timestamps have been deprecated in arrow schema"); - } - type_type = flatbuf::Type_Timestamp; // Use 
one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -278,7 +255,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps, int96_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -289,12 +266,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - col->children[idx], - col_meta.child(idx), - write_mode, - utc_timestamps, - int96_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -314,23 +287,20 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps) + bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(column->type(), - dispatch_to_flatbuf{fbb, - column, - column_metadata, - write_mode, - utc_timestamps, - int96_timestamps, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + column->type(), + dispatch_to_flatbuf{ + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + + std::cout << "Name: " << column_metadata.get_name() + << ", Type: " << static_cast::type>(column->type().id()) + << std::endl; auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children 
= fbb.CreateVector(children.data(), children.size()); @@ -345,8 +315,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps) + bool const utc_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { @@ -368,12 +337,8 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields(fbb, - thrust::get<0>(elem), - thrust::get<1>(elem), - write_mode, - utc_timestamps, - int96_timestamps); + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 8ba48361eb3..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -72,14 +72,12 @@ using namespace cudf::io::detail; * @param metadata Metadata of the columns of the table * @param write_mode Flag to indicate that we are guaranteeing a single table write * @param utc_timestamps Flag to indicate if timestamps are UTC - * @param int96_timestamps Flag to indicate if timestamps was written as INT96 * * @return The constructed arrow ipc message string */ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const utc_timestamps, - bool const int96_timestamps); + bool const 
utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 29ac5125a85..1c2b26f1f8f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -345,8 +345,8 @@ struct leaf_schema_fn { schema_tree_node& col_schema; cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; - bool timestamp_is_int96; bool timestamp_is_utc; + bool write_arrow_schema; template std::enable_if_t, void> operator()() @@ -468,72 +468,75 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.ts_scale = 1000; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = 1000; + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (not timestamp_is_int96) { - col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; - } + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.type = Type::INT64; col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - if (timestamp_is_int96) { - col_schema.ts_scale = -1000; // negative value indicates division by absolute value - } - // set logical type if it's not int96 - else { - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; - } + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = 24 * 60 * 60; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60; + + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale *= 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; + col_schema.stats_dtype = + (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template @@ -541,6 +544,9 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template @@ -548,6 +554,9 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template @@ -609,8 +618,8 @@ std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, - bool int96_timestamps, - bool utc_timestamps) + bool utc_timestamps, + bool write_arrow_schema) { std::vector schema; schema_tree_node root{}; @@ -876,11 +885,9 @@ std::vector construct_parquet_schema_tree( schema_tree_node col_schema{}; - bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); - cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); + leaf_schema_fn{col_schema, col, col_meta, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? 
OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1133,19 +1140,17 @@ void calculate_page_fragments(device_span frag, * * @param frag_stats output statistics * @param frags Input page fragments - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, device_span frags, - bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( - frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); + frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream); stream.synchronize(); } @@ -1655,7 +1660,6 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param dict_policy Policy for dictionary use * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any @@ -1680,7 +1684,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, dictionary_policy dict_policy, size_t max_dictionary_size, single_write_mode write_mode, - bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, bool write_arrow_schema, @@ -1689,7 +1692,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, { auto vec = table_to_linked_columns(input); auto schema_tree = - construct_parquet_schema_tree(vec, table_meta, write_mode, 
int96_timestamps, utc_timestamps); + construct_parquet_schema_tree(vec, table_meta, write_mode, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. std::vector parquet_columns; @@ -1817,9 +1820,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) ? construct_arrow_schema_ipc_message( - vec, table_meta, write_mode, utc_timestamps, int96_timestamps) - : ""); + (write_arrow_schema) + ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -1990,10 +1993,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and gather fragment statistics if (not frag_stats.is_empty()) { - gather_fragment_statistics(frag_stats, - {page_fragments.device_ptr(), static_cast(total_frags)}, - int96_timestamps, - stream); + gather_fragment_statistics( + frag_stats, {page_fragments.device_ptr(), static_cast(total_frags)}, stream); } } @@ -2297,7 +2298,6 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2328,7 +2328,6 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), 
_write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2407,7 +2406,6 @@ void writer::impl::write(table_view const& input, std::vector co _dict_policy, _max_dictionary_size, _single_write_mode, - _int96_timestamps, _utc_timestamps, _write_v2_headers, _write_arrow_schema, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..bcc8de13ceb 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,7 +153,6 @@ class writer::impl { dictionary_policy const _dict_policy; size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; - bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; bool const _write_arrow_schema; diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c11a6d026b9..886133e6763 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -412,7 +412,7 @@ def write_parquet( object column_encoding=None, object column_type_length=None, object output_as_binary=None, - write_arrow_schema=True, + write_arrow_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -597,7 +597,7 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. - write_arrow_schema : bool, default True + write_arrow_schema : bool, default False If ``True``, enable computing and writing arrow schema to Parquet file footer's key-value metadata section. 
See Also @@ -628,7 +628,7 @@ cdef class ParquetWriter: int max_page_size_rows=20000, int max_dictionary_size=1048576, bool use_dictionary=True, - bool store_schema=True): + bool store_schema=False): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 220cef67bd8..a44e9612d8f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,38 +1604,6 @@ def clone_field(table, name, datatype): assert_eq(expect, got) -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): - gdf_fname = tmpdir.join("gdf.parquet") - - if len(pdf) == 0: - pdf = pdf.reset_index(drop=True) - gdf = gdf.reset_index(drop=True) - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - if "col_category" in gdf.columns: - gdf = gdf.drop(columns=["col_category"]) - - assert_eq(pdf, gdf) - - # Write out the gdf using the GPU accelerated writer with INT96 timestamps - # INT96 timestamps have been deprecated in Arrow so set `store_schema=False` - gdf.to_parquet( - gdf_fname.strpath, - index=None, - int96_timestamps=True, - store_schema=False, - ) - - assert os.path.exists(gdf_fname) - - expect = pdf - got = pd.read_parquet(gdf_fname) - - # verify INT96 timestamps were converted back to the same data. 
- assert_eq(expect, got, check_categorical=False, check_dtype=False) - def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") @@ -3171,7 +3139,7 @@ def test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=True) got = pd.read_parquet(buffer) expected = pd.DataFrame( @@ -3209,7 +3177,7 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=True) # TODO: Remove `check_dtype` once following issue is fixed in arrow: # https://github.com/apache/arrow/issues/33321 assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) @@ -3451,7 +3419,7 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): # Write to Parquet buffer = BytesIO() - expected.to_parquet(buffer) + expected.to_parquet(buffer, store_schema=True) # Read parquet with pyarrow and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) @@ -3535,7 +3503,7 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Write expected data frame to Parquet buffer = BytesIO() - expected.to_parquet(buffer) + expected.to_parquet(buffer, store_schema=True) # Read Parquet with pyarrow pa_got = pq.read_table(buffer) From 04a19985be93f3f9201ad3716fb29ce4459fdbbf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 02:12:42 +0000 Subject: [PATCH 17/45] Disable the missed `store_schema` in parquet.py --- python/cudf/cudf/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 25647d16271..ef5bd50053f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -962,7 +962,7 @@ def to_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, - store_schema=True, + store_schema=False, *args, **kwargs, ): From 
ff22e7ddd173cd4f74d5f7540eec1a71271a77d8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 05:44:23 +0000 Subject: [PATCH 18/45] minor bug fixing --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 4 -- cpp/src/io/parquet/writer_impl.cu | 12 +++-- cpp/tests/io/parquet_writer_test.cpp | 60 ++++++++++++++-------- python/cudf/cudf/tests/test_parquet.py | 55 +++++++++++++------- 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 8be17db3781..90d956e578d 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -67,7 +67,7 @@ void write_parquet(cudf::table_view input, table_metadata.column_metadata.end(), [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata); + builder.metadata(table_metadata).write_arrow_schema(true); auto options = builder.build(); options.set_compression(compression); // Either use the input stats level or don't write stats diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index f3f3fba4bac..5f7fd9e9409 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -298,10 +298,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, dispatch_to_flatbuf{ fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - std::cout << "Name: " << column_metadata.get_name() - << ", Type: " << static_cast::type>(column->type().id()) - << std::endl; - auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1c2b26f1f8f..5d1001633f8 
100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -94,11 +94,13 @@ struct aggregate_writer_metadata { [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); - // Append arrow schema to the key-value metadata - if (not arrow_schema_ipc_message.empty()) { - this->files[p].key_value_metadata.emplace_back( - KeyValue{"ARROW:schema", arrow_schema_ipc_message}); - } + } + + // Append arrow schema to the key-value metadata + if (not arrow_schema_ipc_message.empty()) { + std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { + file.key_value_metadata.emplace_back(KeyValue{"ARROW:schema", arrow_schema_ipc_message}); + }); } } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index ad0860e265e..509c89480e3 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -35,7 +35,7 @@ using cudf::test::iterators::no_nulls; template -void test_durations(mask_op_t mask_op, bool use_byte_stream_split) +void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); @@ -73,23 +73,33 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT); } } - - auto filepath = temp_env->get_temp_filepath("Durations.parquet"); + std::string a = (arrow_schema) ? 
"1" : "0"; + auto filepath = "/home/coder/Durations" + a + ".parquet"; cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_arrow_schema(arrow_schema); + cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + if (arrow_schema) { + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); @@ -98,10 +108,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) TEST_F(ParquetWriterTest, Durations) { - 
test_durations([](auto i) { return true; }, false); - test_durations([](auto i) { return (i % 2) != 0; }, false); - test_durations([](auto i) { return (i % 3) != 0; }, false); - test_durations([](auto i) { return false; }, false); + test_durations([](auto i) { return true; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); + + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -1866,10 +1881,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit) TEST_F(ParquetWriterTest, DurationByteStreamSplit) { - test_durations([](auto i) { return true; }, true); - test_durations([](auto i) { return (i % 2) != 0; }, true); - test_durations([](auto i) { return (i % 3) != 0; }, true); - test_durations([](auto i) { return false; }, true); + test_durations([](auto i) { return true; }, true, false); + test_durations([](auto i) { return (i % 2) != 0; }, true, false); + test_durations([](auto i) { return (i % 3) != 0; }, true, false); + test_durations([](auto i) { return false; }, true, false); + + test_durations([](auto i) { return true; }, true, true); + test_durations([](auto i) { return (i % 2) != 0; }, true, true); + test_durations([](auto i) { return (i % 3) != 0; }, true, true); + test_durations([](auto i) { return false; }, true, true); } TEST_F(ParquetWriterTest, WriteFixedLenByteArray) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a44e9612d8f..460d917e95e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2527,6 +2527,9 @@ def normalized_equals(value1, value2): value1 
= value1.replace(tzinfo=None) if isinstance(value2, datetime.datetime): value2 = value2.replace(tzinfo=None) + if isinstance(value1, pd.Timedelta): + unit = "ms" if value1.unit == "s" else value1.unit + value2 = pd.Timedelta(value2, unit=unit) # if one is datetime then both values are datetimes now if isinstance(value1, datetime.datetime): @@ -2540,7 +2543,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2557,7 +2561,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): if add_nulls: for col in gdf: set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False) + gdf.to_parquet(file_path, index=False, store_schema=store_schema) # Read back from pyarrow pq_file = pq.ParquetFile(file_path) @@ -3126,8 +3130,8 @@ def test_parquet_writer_zstd(): got = pd.read_parquet(buff) assert_eq(expected, got) - -def test_parquet_writer_time_delta_physical_type(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( { "s": cudf.Series([1], dtype="timedelta64[s]"), @@ -3139,22 +3143,35 @@ def test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer, store_schema=True) + df.to_parquet(buffer, store_schema=store_schema) got = pd.read_parquet(buffer) - expected = pd.DataFrame( - { - "s": ["0 days 00:00:01"], - "ms": ["0 days 00:00:00.002000"], - "us": ["0 days 00:00:00.000003"], - "ns": ["0 days 00:00:00.000004"], - }, - dtype="str", - ) + + if (store_schema): + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 
00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s": ["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) assert_eq(got.astype("str"), expected) -def test_parquet_roundtrip_time_delta(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): num_rows = 12345 df = cudf.DataFrame( { @@ -3177,11 +3194,11 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer, store_schema=True) - # TODO: Remove `check_dtype` once following issue is fixed in arrow: - # https://github.com/apache/arrow/issues/33321 + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - + if (store_schema == True): + assert_eq(df, pd.read_parquet(buffer)) def test_parquet_reader_malformed_file(datadir): fname = datadir / "nested-unsigned-malformed.parquet" From d5f01beda3f41456703c048623238ba0084a166a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 07:46:25 +0000 Subject: [PATCH 19/45] Fixes for tests --- cpp/src/io/parquet/arrow_schema_writer.cpp | 22 +++++++++++++++++----- cpp/src/io/parquet/parquet.hpp | 11 +++++++++++ cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 15 +++++---------- cpp/tests/io/parquet_writer_test.cpp | 11 ++++------- python/cudf/cudf/tests/test_parquet.py | 9 +++++---- 6 files changed, 43 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 5f7fd9e9409..7d5419e2ebd 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -155,9 +155,15 @@ struct dispatch_to_flatbuf { } template - std::enable_if_t or 
std::is_same_v, - void> - operator()() + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Date; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); + } + + template + std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp @@ -200,8 +206,14 @@ struct dispatch_to_flatbuf { } template - std::enable_if_t or std::is_same_v, void> - operator()() + std::enable_if_t, void> operator()() + { + type_type = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Duration; field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index e35742c2527..b0734719002 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -57,6 +57,15 @@ struct TimeUnit { Type type; }; +struct DateUnit { + enum Type : char { DAYS = 0, MILLIS = 1 }; + Type type; +}; + +struct DateType { + DateUnit unit = {DateUnit::DAYS}; +}; + struct TimeType { // Default to true because the timestamps are implicitly in UTC // Writer option overrides this default @@ -97,12 +106,14 @@ struct LogicalType { thrust::optional decimal_type; thrust::optional time_type; thrust::optional timestamp_type; + thrust::optional date_type; thrust::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} + LogicalType(DateType&& date) : type(DATE), date_type(date) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} constexpr 
bool is_time_millis() const diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 565dc2e02f2..9c7a1348aec 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -45,7 +45,7 @@ thrust::optional converted_to_logical_type(SchemaElement const& sch case MAP: return LogicalType{LogicalType::MAP}; case LIST: return LogicalType{LogicalType::LIST}; case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; - case DATE: return LogicalType{LogicalType::DATE}; + case DATE: return LogicalType{DateType{DateUnit::DAYS}}; case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5d1001633f8..0bfbad8d260 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -464,7 +464,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{LogicalType::DATE}; + col_schema.logical_type = LogicalType{DateType{DateUnit::DAYS}}; } template @@ -507,15 +507,10 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60; - - if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - col_schema.ts_scale *= 1000; - } + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 509c89480e3..58dfc49d4aa 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -86,16 +86,13 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + if (arrow_schema) { - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); } else { - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - auto durations_s_got = cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 460d917e95e..41f47899d84 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,7 +1604,6 @@ def 
clone_field(table, name, datatype): assert_eq(expect, got) - def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=20, dtype="float64") @@ -3130,6 +3129,7 @@ def test_parquet_writer_zstd(): got = pd.read_parquet(buff) assert_eq(expected, got) + @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( @@ -3147,7 +3147,7 @@ def test_parquet_writer_time_delta_physical_type(store_schema): got = pd.read_parquet(buffer) - if (store_schema): + if store_schema: expected = pd.DataFrame( { "s": ["0 days 00:00:01"], @@ -3197,8 +3197,9 @@ def test_parquet_roundtrip_time_delta(store_schema): df.to_parquet(buffer, store_schema=store_schema) # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - if (store_schema == True): - assert_eq(df, pd.read_parquet(buffer)) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) + def test_parquet_reader_malformed_file(datadir): fname = datadir / "nested-unsigned-malformed.parquet" From 55296dfbe903587f9f101e1f6d002c158d014510 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 19:37:27 +0000 Subject: [PATCH 20/45] Cleanup and restore int96timestamps for this PR. 
--- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/include/cudf/types.hpp | 2 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 48 +++++++++++--- cpp/src/io/parquet/arrow_schema_writer.hpp | 1 + cpp/src/io/parquet/parquet_common.hpp | 10 +-- cpp/src/io/parquet/reader_impl_helpers.cpp | 10 ++- cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 77 ++++++++++++++-------- cpp/src/io/parquet/writer_impl.hpp | 1 + cpp/tests/io/parquet_writer_test.cpp | 4 +- python/cudf/cudf/tests/test_parquet.py | 48 ++++++++++++-- 11 files changed, 150 insertions(+), 55 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 90d956e578d..8be17db3781 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -67,7 +67,7 @@ void write_parquet(cudf::table_view input, table_metadata.column_metadata.end(), [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata).write_arrow_schema(true); + builder.metadata(table_metadata); auto options = builder.build(); options.set_compression(compression); // Either use the input stats level or don't write stats diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 466d53fcafc..baf07fa3db6 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -216,7 +216,7 @@ enum class type_id : int32_t { TIMESTAMP_MILLISECONDS, ///< point in time in milliseconds since Unix Epoch in int64 TIMESTAMP_MICROSECONDS, ///< point in time in microseconds since Unix Epoch in int64 TIMESTAMP_NANOSECONDS, ///< point in time in nanoseconds since Unix Epoch in int64 - DURATION_DAYS, ///< time interval of days in int64 + DURATION_DAYS, ///< time interval of days in int32 DURATION_SECONDS, ///< time interval of seconds in int64 DURATION_MILLISECONDS, ///< time interval of milliseconds in int64 DURATION_MICROSECONDS, ///< time interval of microseconds in int64 diff --git 
a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 7d5419e2ebd..6e56c2f1e35 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -55,6 +55,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps); /** @@ -65,6 +66,7 @@ struct dispatch_to_flatbuf { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; single_write_mode const write_mode; + bool const int96_timestamps; bool const utc_timestamps; Offset& field_offset; flatbuf::Type& type_type; @@ -165,6 +167,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -175,6 +180,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -186,6 +194,9 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -197,6 +208,9 @@ struct dispatch_to_flatbuf { 
template std::enable_if_t, void> operator()() { + // Check if writing INT96 timestamps with arrow schema + if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } + type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -267,7 +281,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, int96_timestamps, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -278,8 +292,12 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields( - fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + col->children[idx], + col_meta.child(idx), + write_mode, + int96_timestamps, + utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -299,16 +317,23 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + int96_timestamps, + utc_timestamps, + field_offset, + type_type, + children}); auto const 
fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -323,6 +348,7 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps) { // Lambda function to convert int32 to a string of uint8 bytes @@ -345,8 +371,12 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields( - fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + return make_arrow_schema_fields(fbb, + thrust::get<0>(elem), + thrust::get<1>(elem), + write_mode, + int96_timestamps, + utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..15e4f63a9f9 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -78,6 +78,7 @@ using namespace cudf::io::detail; std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, + bool const int96_timestamps, bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 69c0a89fd57..e42c259b1bf 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cudf::io::parquet::detail { @@ -27,12 +28,13 @@ auto 
constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) // Constants copied from arrow source and renamed to match the case -constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); -constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); -constexpr int32_t IPC_CONTINUATION_TOKEN = -1; +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +int32_t constexpr IPC_CONTINUATION_TOKEN = -1; +std::string const ARROW_SCHEMA_KEY = "ARROW:schema"; // Schema type ipc message has zero length body -constexpr int64_t SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; +int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; /** * @brief Basic data types in Parquet, determines how data is physically stored diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9c7a1348aec..2998bf6f0eb 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -562,14 +562,14 @@ aggregate_reader_metadata::aggregate_reader_metadata( // Collect and apply arrow:schema from Parquet's key value metadata section if (use_arrow_schema) { apply_arrow_schema(); } - // Erase "ARROW:schema" from the output pfm if exists + // Erase ARROW_SCHEMA_KEY from the output pfm if exists std::for_each( - keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase("ARROW:schema"); }); + keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase(ARROW_SCHEMA_KEY); }); } arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const { - // Check the key_value metadata for ARROW:schema, decode and walk it + // Check the key_value metadata for arrow schema, decode and walk it // Function to convert from flatbuf::duration type to cudf::type_id auto 
const duration_from_flatbuffer = [](flatbuf::Duration const* duration) { // TODO: we only need this for arrow::DurationType for now. Else, we can take in a @@ -643,9 +643,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const return true; }; - // TODO: Should we check if any file has the "ARROW:schema" key - // Or if all files have the same "ARROW:schema"? - auto const it = keyval_maps[0].find("ARROW:schema"); + auto const it = keyval_maps[0].find(ARROW_SCHEMA_KEY); if (it == keyval_maps[0].end()) { return {}; } // Decode the base64 encoded ipc message string diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 8b0f59ef33d..6bfa8519c76 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -145,7 +145,7 @@ class aggregate_reader_metadata { const; /** - * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message + * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer */ [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0bfbad8d260..b49f0d3ea73 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -99,7 +99,7 @@ struct aggregate_writer_metadata { // Append arrow schema to the key-value metadata if (not arrow_schema_ipc_message.empty()) { std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { - file.key_value_metadata.emplace_back(KeyValue{"ARROW:schema", arrow_schema_ipc_message}); + file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message}); }); } } @@ -347,6 +347,7 @@ struct leaf_schema_fn { schema_tree_node& col_schema; cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; + bool timestamp_is_int96; bool 
timestamp_is_utc; bool write_arrow_schema; @@ -470,38 +471,50 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.ts_scale = 1000; - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = 1000; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; - col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; + if (timestamp_is_int96) { + col_schema.ts_scale = -1000; // negative value indicates division by absolute value + } + // set logical type if it's not int96 + else { + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template @@ -615,6 +628,7 @@ std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, + bool int96_timestamps, bool utc_timestamps, bool write_arrow_schema) { @@ -882,9 +896,12 @@ std::vector construct_parquet_schema_tree( schema_tree_node col_schema{}; + bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); + cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, utc_timestamps, write_arrow_schema}); + leaf_schema_fn{ + col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -1141,13 +1158,14 @@ void calculate_page_fragments(device_span frag, */ void gather_fragment_statistics(device_span frag_stats, device_span frags, + bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( - frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream); + frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); } @@ -1657,6 +1675,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param dict_policy Policy for dictionary use * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write + * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any @@ -1681,15 +1700,16 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, dictionary_policy dict_policy, size_t max_dictionary_size, single_write_mode write_mode, + bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = - construct_parquet_schema_tree(vec, table_meta, write_mode, utc_timestamps, write_arrow_schema); + auto vec = table_to_linked_columns(input); + auto schema_tree = construct_parquet_schema_tree( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1817,9 +1837,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) - ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) - : ""); + (write_arrow_schema) ? construct_arrow_schema_ipc_message( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -1990,8 +2010,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and gather fragment statistics if (not frag_stats.is_empty()) { - gather_fragment_statistics( - frag_stats, {page_fragments.device_ptr(), static_cast(total_frags)}, stream); + gather_fragment_statistics(frag_stats, + {page_fragments.device_ptr(), static_cast(total_frags)}, + int96_timestamps, + stream); } } @@ -2295,6 +2317,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2325,6 +2348,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2403,6 +2427,7 @@ void writer::impl::write(table_view const& input, std::vector co _dict_policy, _max_dictionary_size, _single_write_mode, + _int96_timestamps, _utc_timestamps, 
_write_v2_headers, _write_arrow_schema, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index bcc8de13ceb..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,6 +153,7 @@ class writer::impl { dictionary_policy const _dict_policy; size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; + bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; bool const _write_arrow_schema; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 58dfc49d4aa..82e4c4bd82f 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -73,8 +73,8 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_sc col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT); } } - std::string a = (arrow_schema) ? "1" : "0"; - auto filepath = "/home/coder/Durations" + a + ".parquet"; + + auto filepath = temp_env->get_temp_filepath("Durations.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_arrow_schema(arrow_schema); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 41f47899d84..8561e2ac4b3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1604,6 +1604,39 @@ def clone_field(table, name, datatype): assert_eq(expect, got) +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): + gdf_fname = tmpdir.join("gdf.parquet") + + if len(pdf) == 0: + pdf = pdf.reset_index(drop=True) + gdf = gdf.reset_index(drop=True) + + if "col_category" in pdf.columns: + pdf = pdf.drop(columns=["col_category"]) + if "col_category" in gdf.columns: + gdf = gdf.drop(columns=["col_category"]) + + assert_eq(pdf, gdf) + + # Write out 
the gdf using the GPU accelerated writer with INT96 timestamps + # TODO: store_schema must be false when working with INT96 timestamps + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + store_schema=False, + ) + + assert os.path.exists(gdf_fname) + + expect = pdf + got = pd.read_parquet(gdf_fname) + + # verify INT96 timestamps were converted back to the same data. + assert_eq(expect, got, check_categorical=False, check_dtype=False) + + def test_multifile_parquet_folder(tmpdir): test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=20, dtype="float64") @@ -3435,17 +3468,19 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) - # Write to Parquet + # Write to Parquet with arrow schema buffer = BytesIO() expected.to_parquet(buffer, store_schema=True) - # Read parquet with pyarrow and cudf readers + # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - got2 = cudf.read_parquet(buffer) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) # Check results assert_eq(expected, got) assert_eq(expected, got2) + assert_eq(expected, got3) @pytest.mark.parametrize( @@ -3516,18 +3551,21 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) + pd_expected = pa_expected.to_pandas() expected = cudf.DataFrame.from_arrow(pa_expected) - # Write expected data frame to Parquet + # Write expected data frame to Parquet with arrow schema buffer = BytesIO() expected.to_parquet(buffer, store_schema=True) - # Read Parquet with pyarrow + # Read Parquet with pyarrow and pandas pa_got = pq.read_table(buffer) + pd_got = pd.read_parquet(buffer) # Check results assert_eq(pa_expected, pa_got) + assert_eq(pd_expected, pd_got) # Convert to cuDF table and also 
read Parquet with cuDF reader got = cudf.DataFrame.from_arrow(pa_got) From 706eb186805dfa352681d2ced0f14437ad6a9f0b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 20:16:56 +0000 Subject: [PATCH 21/45] Modify int96 and arrow schema option behavior --- cpp/src/io/parquet/arrow_schema_writer.cpp | 48 ++++--------------- cpp/src/io/parquet/arrow_schema_writer.hpp | 1 - cpp/src/io/parquet/writer_impl.cu | 19 ++++++-- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 6 +++ 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 6e56c2f1e35..7d5419e2ebd 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -55,7 +55,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps); /** @@ -66,7 +65,6 @@ struct dispatch_to_flatbuf { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; single_write_mode const write_mode; - bool const int96_timestamps; bool const utc_timestamps; Offset& field_offset; flatbuf::Type& type_type; @@ -167,9 +165,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -180,9 +175,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = 
flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -194,9 +186,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -208,9 +197,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - // Check if writing INT96 timestamps with arrow schema - if (int96_timestamps) { CUDF_FAIL("INT96 timestamps are deprecated in arrow schema"); } - type_type = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = @@ -281,7 +267,7 @@ struct dispatch_to_flatbuf { // Hence, we only need to process the second child of the list. 
if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( - fbb, col->children[1], col_meta.child(1), write_mode, int96_timestamps, utc_timestamps)); + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); type_type = flatbuf::Type_List; field_offset = flatbuf::CreateList(fbb).Union(); } @@ -292,12 +278,8 @@ struct dispatch_to_flatbuf { thrust::make_counting_iterator(col->children.size()), std::back_inserter(children), [&](auto const idx) { - return make_arrow_schema_fields(fbb, - col->children[idx], - col_meta.child(idx), - write_mode, - int96_timestamps, - utc_timestamps); + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); type_type = flatbuf::Type_Struct_; field_offset = flatbuf::CreateStruct_(fbb).Union(); @@ -317,23 +299,16 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps) { Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher(column->type(), - dispatch_to_flatbuf{fbb, - column, - column_metadata, - write_mode, - int96_timestamps, - utc_timestamps, - field_offset, - type_type, - children}); + cudf::type_dispatcher( + column->type(), + dispatch_to_flatbuf{ + fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); auto const fb_name = fbb.CreateString(column_metadata.get_name()); auto const fb_children = fbb.CreateVector(children.data(), children.size()); @@ -348,7 +323,6 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps) { // 
Lambda function to convert int32 to a string of uint8 bytes @@ -371,12 +345,8 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), std::back_inserter(field_offsets), [&](auto const& elem) { - return make_arrow_schema_fields(fbb, - thrust::get<0>(elem), - thrust::get<1>(elem), - write_mode, - int96_timestamps, - utc_timestamps); + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); }); // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 15e4f63a9f9..309704d4e87 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -78,7 +78,6 @@ using namespace cudf::io::detail; std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, - bool const int96_timestamps, bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b49f0d3ea73..ab338c4ab49 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1837,9 +1837,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, this_table_schema, num_columns, stats_granularity, - (write_arrow_schema) ? construct_arrow_schema_ipc_message( - vec, table_meta, write_mode, int96_timestamps, utc_timestamps) - : ""); + (write_arrow_schema) + ? 
construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2317,7 +2317,8 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), + _int96_timestamps(options.is_enabled_int96_timestamps() and + not options.is_enabled_write_arrow_schema()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2328,6 +2329,10 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { + if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { + CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); + } + if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } @@ -2348,7 +2353,8 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps()), + _int96_timestamps(options.is_enabled_int96_timestamps() and + not options.is_enabled_write_arrow_schema()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2359,6 +2365,9 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { + if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { + CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); + } if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index cb4ce142543..32245539d3c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -100,6 +100,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void enable_int96_timestamps(bool val) except + void enable_utc_timestamps(bool val) except + void enable_write_arrow_schema(bool val) except + From fa247b7f015eb3df7c52b60da66f1ff716c42ada Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 30 May 2024 20:23:25 +0000 Subject: [PATCH 22/45] Revert _use_arrow_schema to true --- cpp/include/cudf/io/parquet.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ede1994312d..e13d7aab4bd 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -72,7 +72,7 @@ class parquet_reader_options { // Whether to use PANDAS metadata to load columns bool _use_pandas_metadata = true; // Whether to read and use ARROW schema - bool _use_arrow_schema = false; + bool _use_arrow_schema = true; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; From 9607618d1e782e1933fba3620aee583bd80fac4b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 01:48:18 +0000 Subject: [PATCH 23/45] Add tests --- python/cudf/cudf/tests/test_parquet.py | 149 +++++++++++++++++++------ 1 file changed, 116 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py 
b/python/cudf/cudf/tests/test_parquet.py index 8561e2ac4b3..f2c46c1e192 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1798,10 +1798,11 @@ def test_parquet_write_bytes_io(simple_gdf): assert_eq(cudf.read_parquet(output), simple_gdf) -def test_parquet_writer_bytes_io(simple_gdf): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_bytes_io(simple_gdf, store_schema): output = BytesIO() - writer = ParquetWriter(output) + writer = ParquetWriter(output, store_schema=store_schema) writer.write_table(simple_gdf) writer.write_table(simple_gdf) writer.close() @@ -2133,7 +2134,8 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) -def test_parquet_write_to_dataset(tmpdir_factory, cols): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2149,7 +2151,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): "b": np.random.choice(np.arange(4), size=size), } ) - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) # Read back with cudf @@ -2165,7 +2167,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): } ) with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @pytest.mark.parametrize( @@ -2395,7 +2397,8 @@ def test_parquet_writer_list_large_mixed(tmpdir): assert_eq(expect, got) -def test_parquet_writer_list_chunked(tmpdir): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmpdir, store_schema): table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2416,7 +2419,7 @@ def 
test_parquet_writer_list_chunked(tmpdir): expect = cudf.concat([table1, table2]) expect = expect.reset_index(drop=True) - writer = ParquetWriter(fname) + writer = ParquetWriter(fname, store_schema=store_schema) writer.write_table(table1) writer.write_table(table2) writer.close() @@ -3393,30 +3396,85 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): # Check results for reader with schema assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) -def test_parquet_reader_roundtrip_structs_with_arrow_schema(): + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": 
datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Ensure that the structs with duration types are faithfully being # roundtripped across Parquet with arrow schema - data = { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - pdf = pd.DataFrame({"struct": pd.Series(data)}) buffer = BytesIO() @@ -3430,8 +3488,20 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + -def test_parquet_writer_roundtrip_with_arrow_schema(): +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_with_arrow_schema(index): # Ensure that the concrete and nested types are faithfully being roundtripped # across Parquet with arrow schema expected = cudf.DataFrame( @@ -3468,15 +3538,20 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): } ) - # Write to Parquet with arrow schema + # Write to Parquet with arrow schema for faithful roundtrip buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True) + expected.to_parquet(buffer, store_schema=True, index=index) # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) got2 = 
cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) got3 = cudf.read_parquet(buffer) + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + # Check results assert_eq(expected, got) assert_eq(expected, got2) @@ -3547,7 +3622,10 @@ def test_parquet_writer_roundtrip_with_arrow_schema(): ], ], ) -def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_structs_with_arrow_schema( + tmpdir, data, index +): # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) @@ -3557,12 +3635,17 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema(tmpdir, data): # Write expected data frame to Parquet with arrow schema buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True) + expected.to_parquet(buffer, store_schema=True, index=index) # Read Parquet with pyarrow and pandas pa_got = pq.read_table(buffer) pd_got = pd.read_parquet(buffer) + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + pd_got = pd_got.drop(columns="__index_level_0__") + # Check results assert_eq(pa_expected, pa_got) assert_eq(pd_expected, pd_got) From a044f3fd90cc016717355aa83ed183f53628bb77 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:06:09 +0000 Subject: [PATCH 24/45] remove temp variables --- cpp/src/io/parquet/arrow_schema_writer.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 7d5419e2ebd..30759d323d1 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -301,6 +301,7 @@ FieldOffset 
make_arrow_schema_fields(FlatBufferBuilder& fbb, single_write_mode const write_mode, bool const utc_timestamps) { + // Variables to be set by the dispatch_to_flatbuf functor Offset field_offset = 0; flatbuf::Type type_type = flatbuf::Type_NONE; std::vector children; @@ -310,14 +311,15 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, dispatch_to_flatbuf{ fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); - auto const fb_name = fbb.CreateString(column_metadata.get_name()); - auto const fb_children = fbb.CreateVector(children.data(), children.size()); - auto const is_nullable = is_col_nullable(column, column_metadata, write_mode); - DictionaryOffset dictionary = 0; - // push to field offsets vector return flatbuf::CreateField( - fbb, fb_name, is_nullable, type_type, field_offset, dictionary, fb_children); + fbb, + fbb.CreateString(column_metadata.get_name()), // name + is_col_nullable(column, column_metadata, write_mode), // nullable + type_type, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector } std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, From a5ab9fbd1b60b8e37e3a60554ed8d28f43d886f1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:09:03 +0000 Subject: [PATCH 25/45] minor comments cleanup --- cpp/src/io/parquet/arrow_schema_writer.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 30759d323d1..a9b3131166d 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -355,12 +355,13 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // create an ipc message flatbuffer fbb.Finish(flatbuf::CreateMessage( fbb, - 
flatbuf::MetadataVersion_V5, /* Metadata version V5 (latest) */ - flatbuf::MessageHeader_Schema, /* Schema type message header */ - flatbuf::CreateSchema( - fbb, flatbuf::Endianness::Endianness_Little, fbb.CreateVector(field_offsets)) - .Union(), /* Build an arrow:schema from the field vector */ - SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH /* Body length is zero for schema type ipc message */ + flatbuf::MetadataVersion_V5, // Metadata version V5 (latest) + flatbuf::MessageHeader_Schema, // Schema type message header + flatbuf::CreateSchema(fbb, + flatbuf::Endianness::Endianness_Little, + fbb.CreateVector(field_offsets)) + .Union(), // arrow:schema built from the field vector + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH // Body length is zero for schema type ipc message )); // Construct the final string and store it here to use its view in base64_encode From 844a1d6030f9ba89d33deba9d7c0b48c67625b49 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 31 May 2024 02:25:09 +0000 Subject: [PATCH 26/45] revert convertedtype setting --- cpp/src/io/parquet/writer_impl.cu | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ab338c4ab49..b9e31f5704f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -520,10 +520,13 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // duration_D is based on int32_t and not a valid arrow duration type so simply convert to + // time32(ms). 
+ col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template @@ -532,9 +535,11 @@ struct leaf_schema_fn { col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; col_schema.stats_dtype = (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + // only write as time32 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; - col_schema.ts_scale = 1000; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; } } @@ -544,8 +549,10 @@ struct leaf_schema_fn { col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; col_schema.stats_dtype = (write_arrow_schema) ? 
statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; + // only write as time32 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } } @@ -554,8 +561,10 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; } } @@ -564,6 +573,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } From dc1608c446d0ef38e1fc11bd9309b1b4f654e5ce Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:30:18 +0000 Subject: [PATCH 27/45] Add decimal column conversion --- cpp/src/io/parquet/arrow_schema_writer.cpp | 24 +-- cpp/src/io/parquet/arrow_schema_writer.hpp | 25 --- cpp/src/io/parquet/writer_impl.cu | 217 ++++++++++++++------- cpp/src/io/parquet/writer_impl_helpers.cpp | 107 ++++++++++ cpp/src/io/parquet/writer_impl_helpers.hpp | 80 ++++++++ cpp/tests/io/parquet_writer_test.cpp | 44 +++++ python/cudf/cudf/tests/test_parquet.py | 12 +- 7 files changed, 399 insertions(+), 110 deletions(-) create mode 100644 cpp/src/io/parquet/writer_impl_helpers.cpp create mode 100644 cpp/src/io/parquet/writer_impl_helpers.hpp diff --git 
a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index a9b3131166d..accc5e52533 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -25,6 +25,7 @@ #include "io/utilities/base64_utilities.hpp" #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include "writer_impl_helpers.hpp" #include #include @@ -243,19 +244,20 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - if constexpr (std::is_same_v) { - type_type = flatbuf::Type_Decimal; - field_offset = - flatbuf::CreateDecimal(fbb, col_meta.get_decimal_precision(), col->type().scale(), 128) - .Union(); - } - // cuDF-PQ writer supports ``decimal32`` and ``decimal64`` types, not directly supported by - // Arrow without explicit conversion. See more: - // https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155. - else { - // TODO: Should we fail here or just not write arrow schema?. + if constexpr (not std::is_same_v) { + // ``decimal32`` and ``decimal64`` types are not supported by + // Arrow without explicit conversion. CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); } + + type_type = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, + (col_meta.is_decimal_precision_set()) + ? 
col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, + col->type().scale(), + 128) + .Union(); } template diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 309704d4e87..4043889ea99 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -34,31 +34,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -/** - * @brief Returns ``true`` if the column is nullable or if the write mode is not - * set to write the table all at once instead of chunked - * - * @param column A view of the column - * @param column_metadata Metadata of the column - * @param write_mode Flag to indicate that we are guaranteeing a single table write - * - * @return Whether the column is nullable. - */ -[[nodiscard]] inline bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - single_write_mode write_mode) -{ - if (column_metadata.is_nullability_defined()) { - CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return column_metadata.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. 
- return write_mode == single_write_mode::NO or column->nullable(); -} - /** * @brief Construct and return arrow schema from input parquet schema * diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b9e31f5704f..ef2e3e54245 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -31,6 +31,7 @@ #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" +#include "writer_impl_helpers.hpp" #include #include @@ -192,26 +193,6 @@ struct aggregate_writer_metadata { namespace { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - /** * @brief Convert a mask of encodings to a vector. 
* @@ -582,25 +563,30 @@ struct leaf_schema_fn { template std::enable_if_t(), void> operator()() { - if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; - } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; - col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; - } else if (std::is_same_v) { + // If writing arrow schema, then convert d32 and d64 to d128 + if (write_arrow_schema or std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { - CUDF_FAIL("Unsupported fixed point type for parquet writer"); + if (std::is_same_v) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; + } else if (std::is_same_v) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; + } else { + CUDF_FAIL("Unsupported fixed point type for parquet writer"); + } } + + // Write logical and converted types, decimal scale and precision col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs col_schema.logical_type->decimal_type->scale = 
-col->type().scale(); @@ -1179,32 +1165,6 @@ void gather_fragment_statistics(device_span frag_stats, stream.synchronize(); } -auto to_nvcomp_compression_type(Compression codec) -{ - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -auto page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, @@ -1644,23 +1604,125 @@ size_t column_index_buffer_size(EncColumnChunk* ck, } /** - * @brief Fill the table metadata with default column names. 
+ * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data + */ +template +rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, + rmm::cuda_stream_view stream) +{ + size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + + rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); + + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(d128_buffer.data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return d128_buffer; +} + +/** + * @brief Helper function to convert decimal32 and decimal64 columns to decimal128 data, + * update the input table metadata, and return a new vector of column views. * - * @param table_meta The table metadata to fill + * @param[in,out] table_meta The table metadata + * @param input The input table + * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A vector of column views with decimal32/decimal64 columns converted to decimal128 */ -void fill_table_meta(std::unique_ptr const& table_meta) +std::vector convert_decimal_columns_and_metadata( + table_input_metadata& table_meta, + table_view const& table, + std::vector>& d128_vectors, + rmm::cuda_stream_view stream) { - // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); - for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); - } - }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); - } + std::vector converted_column_views{table.begin(), table.end()}; + + std::function convert_column = + [&](column_view& column, column_in_metadata& metadata) -> void { + // Vector of passable-by-reference children column views + std::vector converted_children{column.child_begin(), column.child_end()}; + // Process children column views first + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.num_children()), + [&](auto const idx) { convert_column(converted_children[idx], metadata.child(idx)); }); + + // Process this column view. Only convert if decimal32 and decimal64 column. 
+ switch (column.type().id()) { + case type_id::DECIMAL32: + // Convert data to decimal128 type + d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + case type_id::DECIMAL64: + // Convert data to decimal128 type + d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + default: + // Update the children vector keeping everything else the same + column = column_view{column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + break; + } + }; + + // Convert each column view + std::for_each(thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.begin(), + table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.end(), + table_meta.column_metadata.end())), + [&](auto elem) { convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + + return converted_column_views; } /** @@ -1717,7 +1779,16 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, host_span const> out_sink, rmm::cuda_stream_view 
stream) { - auto vec = table_to_linked_columns(input); + // Container to store decimal128 converted data if needed + std::vector> d128_vectors; + + // Convert decimal32/decimal64 data to decimal128 if writing arrow schema + // and initialize LinkedColVector + auto vec = table_to_linked_columns( + (write_arrow_schema) + ? table_view({convert_decimal_columns_and_metadata(table_meta, input, d128_vectors, stream)}) + : input); + auto schema_tree = construct_parquet_schema_tree( vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp new file mode 100644 index 00000000000..b2400ec19b2 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file writer_impl_helpers.cpp + * @brief Helper function implementation for Parquet writer + */ + +#include "writer_impl_helpers.hpp" + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec) +{ + if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; + if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; + CUDF_FAIL("Unsupported compression type"); +} + +uint32_t page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + +void fill_table_meta(std::unique_ptr const& table_meta) +{ + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < 
col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } +} + +[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} + +[[nodiscard]] bool is_col_fixed_width(column_view const& column) +{ + if (column.type().id() == type_id::STRUCT) { + return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width); + } + + return is_fixed_width(column.type()); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp new file mode 100644 index 00000000000..9ffa4538134 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file writer_impl_helpers.hpp + * @brief Helper function declarations for Parquet writer + */ + +#pragma once +#include "io/comp/nvcomp_adapter.hpp" +#include "parquet_common.hpp" + +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +/** + * @brief Function that translates GDF compression to parquet compression. + * + * @param compression The compression type + * @return The supported Parquet compression + */ +Compression to_parquet_compression(compression_type compression); + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec); + +uint32_t page_alignment(Compression codec); + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); + +/** + * @brief Fill the table metadata with default column names. + * + * @param table_meta The table metadata to fill + */ +void fill_table_meta(std::unique_ptr const& table_meta); + +/** + * @brief Returns ``true`` if the column is nullable or if the write mode is not + * set to write the table all at once instead of chunked + * + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode); +/** + * @brief Returns ``true`` if the given column has a fixed size. + * + * This doesn't check every row, so assumes string and list columns are not fixed, even + * if each row is the same width. + * TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes. 
+ * + * @param column A view of the column + * + * @return Whether the column has a fixed size + */ +[[nodiscard]] bool is_col_fixed_width(column_view const& column); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 82e4c4bd82f..e3430c16363 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -503,6 +503,50 @@ TEST_F(ParquetWriterTest, DecimalWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); } +TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .write_arrow_schema(true); + + cudf::io::table_input_metadata expected_metadata(table); + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION); + expected_metadata.column_metadata[1].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto expected_col1 = 
cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto expected_table = table_view({expected_col0, expected_col1}); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table); +} + TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { auto const unused_table = std::make_unique(); diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index f2c46c1e192..fd5dd439f18 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3528,7 +3528,13 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), "list": list([[1, 2], [1, 2], [1, 2]]), "bool": cudf.Series([True, None, False], dtype=bool), - "fixed_pt": cudf.Series([0.00, 1.0, None]).astype( + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( cudf.Decimal128Dtype(7, 2) ), "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), @@ -3542,6 +3548,10 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): buffer = BytesIO() expected.to_parquet(buffer, store_schema=True, index=index) + # Convert decimal types to d128 + expected = expected.astype({'fixed32': cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({'fixed64': cudf.Decimal128Dtype(18, 2)}) + # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) From 0946eb48777a6c95c7bf5bc6379cbca195a0aa48 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 4 Jun 2024 08:35:54 +0000 
Subject: [PATCH 28/45] minor ruff-formatting fix --- python/cudf/cudf/tests/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fd5dd439f18..8f6517e15a3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3549,8 +3549,8 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): expected.to_parquet(buffer, store_schema=True, index=index) # Convert decimal types to d128 - expected = expected.astype({'fixed32': cudf.Decimal128Dtype(9, 2)}) - expected = expected.astype({'fixed64': cudf.Decimal128Dtype(18, 2)}) + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) # Read parquet with pyarrow, pandas and cudf readers got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) From ea01aeedf4c3eef1f13268f1f63c3ed6df2fa8d1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:56:35 +0000 Subject: [PATCH 29/45] refactor and move some helpers to writer_impl_helpers.cpp --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/writer_impl.cu | 135 +++++++-------------- cpp/src/io/parquet/writer_impl_helpers.cpp | 31 +++++ cpp/src/io/parquet/writer_impl_helpers.hpp | 9 ++ 4 files changed, 85 insertions(+), 91 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 633f5f36b38..f66408e318a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -423,6 +423,7 @@ add_library( src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu + src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ef2e3e54245..f9fd936a40b 
100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -42,9 +42,6 @@ #include #include #include -#include -#include -#include #include #include @@ -254,52 +251,6 @@ void update_chunk_encoding_stats(ColumnChunkMetaData& chunk_meta, if (not result.empty()) { chunk_meta.encoding_stats = std::move(result); } } -/** - * @brief Compute size (in bytes) of the data stored in the given column. - * - * @param column The input column - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The data size of the input - */ -size_t column_size(column_view const& column, rmm::cuda_stream_view stream) -{ - if (column.is_empty()) { return 0; } - - if (is_fixed_width(column.type())) { - return size_of(column.type()) * column.size(); - } else if (column.type().id() == type_id::STRING) { - auto const scol = strings_column_view(column); - return cudf::strings::detail::get_offset_value( - scol.offsets(), column.size() + column.offset(), stream) - - cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); - } else if (column.type().id() == type_id::STRUCT) { - auto const scol = structs_column_view(column); - size_t ret = 0; - for (int i = 0; i < scol.num_children(); i++) { - ret += column_size(scol.get_sliced_child(i, stream), stream); - } - return ret; - } else if (column.type().id() == type_id::LIST) { - auto const lcol = lists_column_view(column); - return column_size(lcol.get_sliced_child(stream), stream); - } - - CUDF_FAIL("Unexpected compound type"); -} - -// checks to see if the given column has a fixed size. This doesn't -// check every row, so assumes string and list columns are not fixed, even -// if each row is the same width. -// TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes. 
-bool is_col_fixed_width(column_view const& column) -{ - if (column.type().id() == type_id::STRUCT) { - return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width); - } - - return is_fixed_width(column.type()); -} - /** * @brief Extends SchemaElement to add members required in constructing parquet_column_view * @@ -1642,85 +1593,87 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co } /** - * @brief Helper function to convert decimal32 and decimal64 columns to decimal128 data, + * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param input The input table * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * * @return A device vector containing the converted decimal128 data */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - table_view const& table, std::vector>& d128_vectors, + table_view const& table, rmm::cuda_stream_view stream) { - std::vector converted_column_views{table.begin(), table.end()}; - - std::function convert_column = - [&](column_view& column, column_in_metadata& metadata) -> void { + // Lambda function to convert each decimal32/decimal64 column to decimal128. 
+ std::function convert_column = + [&](column_view column, column_in_metadata& metadata) -> column_view { // Vector of passable-by-reference children column views - std::vector converted_children{column.child_begin(), column.child_end()}; + std::vector converted_children; + // Process children column views first - std::for_each( + std::transform( thrust::make_counting_iterator(0), thrust::make_counting_iterator(column.num_children()), - [&](auto const idx) { convert_column(converted_children[idx], metadata.child(idx)); }); + std::back_inserter(converted_children), + [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); }); // Process this column view. Only convert if decimal32 and decimal64 column. switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector - column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, - column.size(), - d128_vectors.back().data(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.push_back(convert_data_to_decimal128(column, stream)); + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new 
column view from the d128 data vector - column = column_view{data_type{type_id::DECIMAL128, column.type().scale()}, - column.size(), - d128_vectors.back().data(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; default: // Update the children vector keeping everything else the same - column = column_view{column.type(), - column.size(), - column.head(), - column.null_mask(), - column.null_count(), - column.offset(), - converted_children}; - break; + return {column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; } }; + // Vector of converted column views + std::vector converted_column_views; + // Convert each column view - std::for_each(thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.begin(), - table_meta.column_metadata.begin())), - thrust::make_zip_iterator(thrust::make_tuple(converted_column_views.end(), - table_meta.column_metadata.end())), - [&](auto elem) { convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + std::transform( + thrust::make_zip_iterator( + thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())), + std::back_inserter(converted_column_views), + [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); return converted_column_views; } @@ -1786,7 +1739,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, input, d128_vectors, stream)}) + ? 
table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index b2400ec19b2..364b1a9777a 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -21,6 +21,11 @@ #include "writer_impl_helpers.hpp" +#include +#include +#include +#include + namespace cudf::io::parquet::detail { using namespace cudf::io::detail; @@ -80,6 +85,32 @@ void fill_table_meta(std::unique_ptr const& table_meta) } } +size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +{ + if (column.is_empty()) { return 0; } + + if (is_fixed_width(column.type())) { + return size_of(column.type()) * column.size(); + } else if (column.type().id() == type_id::STRING) { + auto const scol = strings_column_view(column); + return cudf::strings::detail::get_offset_value( + scol.offsets(), column.size() + column.offset(), stream) - + cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); + } else if (column.type().id() == type_id::STRUCT) { + auto const scol = structs_column_view(column); + size_t ret = 0; + for (int i = 0; i < scol.num_children(); i++) { + ret += column_size(scol.get_sliced_child(i, stream), stream); + } + return ret; + } else if (column.type().id() == type_id::LIST) { + auto const lcol = lists_column_view(column); + return column_size(lcol.get_sliced_child(stream), stream); + } + + CUDF_FAIL("Unexpected compound type"); +} + [[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, single_write_mode write_mode) diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 9ffa4538134..316ee1da240 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ 
-51,6 +51,15 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block */ void fill_table_meta(std::unique_ptr const& table_meta); +/** + * @brief Compute size (in bytes) of the data stored in the given column. + * + * @param column The input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The data size of the input + */ +size_t column_size(column_view const& column, rmm::cuda_stream_view stream); + /** * @brief Returns ``true`` if the column is nullable or if the write mode is not * set to write the table all at once instead of chunked From b9f2989e31a86d76aef1a95f1b75058c972d9d87 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 6 Jun 2024 21:05:41 +0000 Subject: [PATCH 30/45] resolve conflicts, minor doc and pytest updates. --- python/cudf/cudf/_lib/parquet.pyx | 4 ++-- python/cudf/cudf/io/parquet.py | 7 +++++++ python/cudf/cudf/tests/test_parquet.py | 17 +++++++++-------- python/cudf/cudf/utils/ioutils.py | 3 +++ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1dbe31aac6a..86a2585121d 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -621,9 +621,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. - write_arrow_schema : bool, default False + store_schema : bool, default False If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section. + file footer's key-value metadata section for faithful round-tripping. 
See Also -------- cudf.io.parquet.write_parquet diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ef5bd50053f..e86334633ef 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -156,6 +156,7 @@ def write_to_dataset( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. For each combination of partition group and value, @@ -244,6 +245,9 @@ def write_to_dataset( output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. """ fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -287,6 +291,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) else: @@ -314,6 +319,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) return metadata @@ -1018,6 +1024,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) partition_info = ( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 47334cf1371..6e1e491c732 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2555,6 +2555,10 @@ def normalized_equals(value1, value2): value1 = None if value2 is pd.NA or value2 is pd.NaT: value2 = None + if isinstance(value1, np.datetime64): + value1 = pd.Timestamp(value1).to_pydatetime() + if isinstance(value2, np.datetime64): 
+ value2 = pd.Timestamp(value2).to_pydatetime() if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -3489,7 +3493,6 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Check results assert_eq(expected, got) -<<<<<<< arrow-schema-support-pq-writer # Reset buffer buffer = BytesIO() @@ -3501,6 +3504,9 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): # Convert to cudf table for an apple to apple comparison expected = cudf.from_pandas(pdf) + # Check results + assert_eq(expected, got) + @pytest.mark.parametrize("index", [None, True, False]) def test_parquet_writer_roundtrip_with_arrow_schema(index): @@ -3641,7 +3647,6 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( # Ensure that the structs are faithfully being roundtripped across # Parquet with arrow schema pa_expected = pa.Table.from_pydict({"struct": data}) - pd_expected = pa_expected.to_pandas() expected = cudf.DataFrame.from_arrow(pa_expected) @@ -3649,18 +3654,15 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( buffer = BytesIO() expected.to_parquet(buffer, store_schema=True, index=index) - # Read Parquet with pyarrow and pandas + # Read Parquet with pyarrow pa_got = pq.read_table(buffer) - pd_got = pd.read_parquet(buffer) # drop the index column for comparison: __index_level_0__ if index: pa_got = pa_got.drop(columns="__index_level_0__") - pd_got = pd_got.drop(columns="__index_level_0__") # Check results assert_eq(pa_expected, pa_got) - assert_eq(pd_expected, pd_got) # Convert to cuDF table and also read Parquet with cuDF reader got = cudf.DataFrame.from_arrow(pa_got) @@ -3669,7 +3671,7 @@ def test_parquet_writer_roundtrip_structs_with_arrow_schema( # Check results assert_eq(expected, got) assert_eq(expected, got2) -======= + @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) @@ -3695,4 
+3697,3 @@ def test_parquet_chunked_reader( ) actual = reader.read() assert_eq(expected, actual) ->>>>>>> branch-24.08 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0209c692935..5d115c6be5a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -322,6 +322,9 @@ output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. +store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. **kwargs Additional parameters will be passed to execution engines other than ``cudf``. From 8e77687ceaf0870974e4707b3edcb497d9de0ea3 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 20:21:34 +0000 Subject: [PATCH 31/45] Changes from reviewer suggestions --- cpp/include/cudf/io/parquet.hpp | 8 -------- cpp/src/io/parquet/arrow_schema_writer.cpp | 10 +++------- cpp/src/io/parquet/arrow_schema_writer.hpp | 3 --- cpp/src/io/parquet/parquet.hpp | 11 ----------- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/parquet/writer_impl_helpers.hpp | 2 +- 7 files changed, 6 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 6329f3dbe65..8a124069e54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -840,14 +840,6 @@ class parquet_writer_options_base { */ void enable_write_arrow_schema(bool val); - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks in sink info - */ - void set_column_chunks_file_paths(std::vector file_paths); - /** * @brief Sets the maximum row group size, in bytes. 
 * diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index accc5e52533..668d4754800 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -159,7 +159,7 @@ struct dispatch_to_flatbuf { std::enable_if_t, void> operator()() { type_type = flatbuf::Type_Date; - // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + // Date type (Set unit type to DAY for Arrow's Date32) field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); } @@ -209,6 +209,8 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { + // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. + // This also allows for easy and faithful roundtripping with cudf. type_type = flatbuf::Type_Time; field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } @@ -244,12 +246,6 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - if constexpr (not std::is_same_v) { - // ``decimal32`` and ``decimal64`` types are not supported by - // Arrow without explicit conversion. 
- CUDF_FAIL("Fixed point types smaller than `decimal128` are not supported in arrow schema"); - } - type_type = flatbuf::Type_Decimal; field_offset = flatbuf::CreateDecimal(fbb, (col_meta.is_decimal_precision_set()) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 4043889ea99..7b7c6cf722c 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -27,9 +27,6 @@ #include #include -#include -#include - namespace cudf::io::parquet::detail { using namespace cudf::io::detail; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index b0734719002..e35742c2527 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -57,15 +57,6 @@ struct TimeUnit { Type type; }; -struct DateUnit { - enum Type : char { DAYS = 0, MILLIS = 1 }; - Type type; -}; - -struct DateType { - DateUnit unit = {DateUnit::DAYS}; -}; - struct TimeType { // Default to true because the timestamps are implicitly in UTC // Writer option overrides this default @@ -106,14 +97,12 @@ struct LogicalType { thrust::optional decimal_type; thrust::optional time_type; thrust::optional timestamp_type; - thrust::optional date_type; thrust::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} - LogicalType(DateType&& date) : type(DATE), date_type(date) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} constexpr bool is_time_millis() const diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 2998bf6f0eb..63f25d417ff 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -45,7 +45,7 @@ thrust::optional converted_to_logical_type(SchemaElement 
const& sch case MAP: return LogicalType{LogicalType::MAP}; case LIST: return LogicalType{LogicalType::LIST}; case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; - case DATE: return LogicalType{DateType{DateUnit::DAYS}}; + case DATE: return LogicalType{LogicalType::DATE}; case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f9fd936a40b..f9ca9553cf1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -397,7 +397,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{DateType{DateUnit::DAYS}}; + col_schema.logical_type = LogicalType{LogicalType::DATE}; } template diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 316ee1da240..6adacc981d1 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -58,7 +58,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); * @param stream CUDA stream used for device memory operations and kernel launches * @return The data size of the input */ -size_t column_size(column_view const& column, rmm::cuda_stream_view stream); +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); /** * @brief Returns ``true`` if the column is nullable or if the write mode is not From 30057c0636e7c1dd683a647d35846bfd8197ea8d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 20:32:14 +0000 Subject: [PATCH 32/45] Minor changes from reviewer suggestions. 
--- cpp/src/io/parquet/writer_impl.cu | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f9ca9553cf1..d2604f058bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -464,11 +464,14 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - // only write as time32 logical type if not writing arrow schema - if (not write_arrow_schema) { + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; col_schema.ts_scale = 1000; @@ -478,11 +481,14 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = (write_arrow_schema) ? Type::INT64 : Type::INT32; - col_schema.stats_dtype = - (write_arrow_schema) ? statistics_dtype::dtype_int64 : statistics_dtype::dtype_int32; - // only write as time32 logical type if not writing arrow schema - if (not write_arrow_schema) { + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. 
Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } @@ -493,7 +499,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; - // only write as time64 logical type if not writing arrow schema + // Only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.converted_type = ConvertedType::TIME_MICROS; col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; @@ -505,7 +511,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; - // only write as time64 logical type if not writing arrow schema + // Only write as time64 logical type if not writing arrow schema if (not write_arrow_schema) { col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } From 0f136429b34d171747904349926852c0d9554d19 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 11 Jun 2024 21:56:17 +0000 Subject: [PATCH 33/45] minor update. add nodiscard. 
--- cpp/src/io/parquet/writer_impl_helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 364b1a9777a..9ded83736d6 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -85,7 +85,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) } } -size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream) { if (column.is_empty()) { return 0; } From f9c123bebba2456ef17cd90be7a73f486df1fe2f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 01:46:00 +0000 Subject: [PATCH 34/45] Minor changes addressing reviewer comments. --- cpp/src/io/functions.cpp | 11 +- cpp/src/io/parquet/arrow_schema_writer.cpp | 121 +++++++++++---------- cpp/src/io/parquet/arrow_schema_writer.hpp | 11 +- cpp/src/io/parquet/writer_impl.cu | 9 +- cpp/src/io/parquet/writer_impl_helpers.hpp | 26 ++++- 5 files changed, 105 insertions(+), 73 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 3a844312367..b4ece9cec66 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -762,6 +762,9 @@ void parquet_writer_options_base::set_compression(compression_type compression) void parquet_writer_options_base::enable_int96_timestamps(bool req) { + CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(), + "INT96 timestamps and arrow schema cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); _write_timestamps_as_int96 = req; } @@ -770,7 +773,13 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val) _write_timestamps_as_UTC = val; } -void parquet_writer_options_base::enable_write_arrow_schema(bool val) { _write_arrow_schema = val; } +void parquet_writer_options_base::enable_write_arrow_schema(bool val) +{ + CUDF_EXPECTS(not 
val or not is_enabled_int96_timestamps(), + "arrow schema and INT96 timestamps cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); + _write_arrow_schema = val; +} void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 668d4754800..458ef7f065d 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -33,6 +33,10 @@ namespace cudf::io::parquet::detail { +using namespace cudf::io::detail; + +namespace { + // Copied over from arrow source for better code readability namespace flatbuf = cudf::io::parquet::flatbuf; using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; @@ -68,97 +72,97 @@ struct dispatch_to_flatbuf { single_write_mode const write_mode; bool const utc_timestamps; Offset& field_offset; - flatbuf::Type& type_type; + flatbuf::Type& field_type_id; std::vector& children; template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Bool; - field_offset = flatbuf::CreateBool(fbb).Union(); + field_type_id = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + field_type_id = 
flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Int; - field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_FloatingPoint; - field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_FloatingPoint; - 
field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Utf8View; - field_offset = flatbuf::CreateUtf8View(fbb).Union(); + field_type_id = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Date; + field_type_id = flatbuf::Type_Date; // Date type (Set unit type to DAY for arrows's Date32) field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); } @@ -166,7 +170,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? 
fbb.CreateString("UTC") : 0) @@ -176,7 +180,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -187,7 +191,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -198,7 +202,7 @@ struct dispatch_to_flatbuf { template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Timestamp; + field_type_id = flatbuf::Type_Timestamp; // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp field_offset = flatbuf::CreateTimestamp( @@ -211,46 +215,46 @@ struct dispatch_to_flatbuf { { // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. // This also allows for easy and faithful roundtripping with cudf. 
- type_type = flatbuf::Type_Time; - field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + field_type_id = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); } template std::enable_if_t, void> operator()() { - type_type = flatbuf::Type_Duration; - field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); } template std::enable_if_t(), void> operator()() { - type_type = flatbuf::Type_Decimal; - field_offset = flatbuf::CreateDecimal(fbb, + field_type_id = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, (col_meta.is_decimal_precision_set()) - ? col_meta.get_decimal_precision() - : MAX_DECIMAL128_PRECISION, + ? 
col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, col->type().scale(), 128) .Union(); @@ -266,8 +270,8 @@ struct dispatch_to_flatbuf { if constexpr (std::is_same_v) { children.emplace_back(make_arrow_schema_fields( fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); - type_type = flatbuf::Type_List; - field_offset = flatbuf::CreateList(fbb).Union(); + field_type_id = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); } // Traverse the struct in DFS manner and process children fields. @@ -279,8 +283,8 @@ struct dispatch_to_flatbuf { return make_arrow_schema_fields( fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); }); - type_type = flatbuf::Type_Struct_; - field_offset = flatbuf::CreateStruct_(fbb).Union(); + field_type_id = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); } } @@ -300,26 +304,33 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, bool const utc_timestamps) { // Variables to be set by the dispatch_to_flatbuf functor - Offset field_offset = 0; - flatbuf::Type type_type = flatbuf::Type_NONE; + Offset field_offset = 0; + flatbuf::Type field_type_id = flatbuf::Type_NONE; std::vector children; - cudf::type_dispatcher( - column->type(), - dispatch_to_flatbuf{ - fbb, column, column_metadata, write_mode, utc_timestamps, field_offset, type_type, children}); + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + field_offset, + field_type_id, + children}); // push to field offsets vector return flatbuf::CreateField( fbb, fbb.CreateString(column_metadata.get_name()), // name is_col_nullable(column, column_metadata, write_mode), // nullable - type_type, // type id + field_type_id, // type id field_offset, // field offset {0}, // DictionaryOffset fbb.CreateVector(children.data(), children.size())); // children vector } +} // namespace + std::string 
construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, single_write_mode const write_mode, diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 7b7c6cf722c..1b62ef35c86 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -29,8 +29,6 @@ namespace cudf::io::parquet::detail { -using namespace cudf::io::detail; - /** * @brief Construct and return arrow schema from input parquet schema * @@ -47,9 +45,10 @@ using namespace cudf::io::detail; * * @return The constructed arrow ipc message string */ -std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, - table_input_metadata const& metadata, - single_write_mode const write_mode, - bool const utc_timestamps); +std::string construct_arrow_schema_ipc_message( + cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + ::cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 01d7de84f0f..c63b12eaa38 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1577,7 +1577,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(column.size()), [in = column.begin(), @@ -2356,8 +2356,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - 
_int96_timestamps(options.is_enabled_int96_timestamps() and - not options.is_enabled_write_arrow_schema()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2368,10 +2367,6 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { - if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { - CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. Disabling INT96 timestamps."); - } - if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 6adacc981d1..3f96d03ab01 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -28,8 +28,6 @@ namespace cudf::io::parquet::detail { -using namespace cudf::io::detail; - /** * @brief Function that translates GDF compression to parquet compression. * @@ -38,10 +36,30 @@ using namespace cudf::io::detail; */ Compression to_parquet_compression(compression_type compression); +/** + * @brief Function that translates the given compression codec to nvcomp compression type. + * + * @param codec Compression codec + * @return Translated nvcomp compression type + */ nvcomp::compression_type to_nvcomp_compression_type(Compression codec); +/** + * @brief Function that computes input alignment requirements for the given compression type. + * + * @param codec Compression codec + * @return Required alignment + */ uint32_t page_alignment(Compression codec); +/** + * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the + * batch. 
+ * + * @param codec Compression codec + * @param compression_blocksize Size of the largest uncompressed chunk in the batch + * @return Maximum compressed chunk size + */ size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); /** @@ -64,7 +82,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); * @brief Returns ``true`` if the column is nullable or if the write mode is not * set to write the table all at once instead of chunked * - * @param column A view of the column + * @param column A view of the (linked) column * @param column_metadata Metadata of the column * @param write_mode Flag to indicate that we are guaranteeing a single table write * @@ -72,7 +90,7 @@ void fill_table_meta(std::unique_ptr const& table_meta); */ [[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, column_in_metadata const& column_metadata, - single_write_mode write_mode); + ::cudf::io::detail::single_write_mode write_mode); /** * @brief Returns ``true`` if the given column has a fixed size. 
* From 92d88a04dbdccb23962a865eac48df706c1619de Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 19:47:04 +0000 Subject: [PATCH 35/45] Rename `is_col_nullable` to `is_output_col_nullable` --- cpp/src/io/parquet/arrow_schema_writer.cpp | 12 ++++++------ cpp/src/io/parquet/writer_impl.cu | 4 ++-- cpp/src/io/parquet/writer_impl_helpers.cpp | 6 +++--- cpp/src/io/parquet/writer_impl_helpers.hpp | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 458ef7f065d..d7e23b63774 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -321,12 +321,12 @@ FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, // push to field offsets vector return flatbuf::CreateField( fbb, - fbb.CreateString(column_metadata.get_name()), // name - is_col_nullable(column, column_metadata, write_mode), // nullable - field_type_id, // type id - field_offset, // field offset - {0}, // DictionaryOffset - fbb.CreateVector(children.data(), children.size())); // children vector + fbb.CreateString(column_metadata.get_name()), // name + is_output_column_nullable(column, column_metadata, write_mode), // nullable + field_type_id, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector } } // namespace diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 229fc9d0401..fd4991da41f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -595,7 +595,7 @@ std::vector construct_parquet_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool const col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode); 
auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -820,7 +820,7 @@ std::vector construct_parquet_schema_tree( right_child_meta.set_name("value"); // check the repetition type of key is required i.e. the col should be non-nullable auto key_col = col->children[lists_column_view::child_column_index]->children[0]; - CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode), + CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode), "key column cannot be nullable. For chunked writing, explicitly set the " "nullability to false in metadata"); // process key diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 9ded83736d6..529fe1cac2c 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -111,9 +111,9 @@ void fill_table_meta(std::unique_ptr const& table_meta) CUDF_FAIL("Unexpected compound type"); } -[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - single_write_mode write_mode) +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) { if (column_metadata.is_nullability_defined()) { CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 3f96d03ab01..1e461e9b4bc 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -88,9 +88,9 @@ void fill_table_meta(std::unique_ptr const& table_meta); * * @return Whether the column is nullable. 
*/ -[[nodiscard]] bool is_col_nullable(cudf::detail::LinkedColPtr const& column, - column_in_metadata const& column_metadata, - ::cudf::io::detail::single_write_mode write_mode); +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + ::cudf::io::detail::single_write_mode write_mode); /** * @brief Returns ``true`` if the given column has a fixed size. * From df11288043f45bdf4de322d7091f3dcb8cf428fe Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 19:56:30 +0000 Subject: [PATCH 36/45] minor comment update --- cpp/src/io/parquet/arrow_schema_writer.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index d7e23b63774..5f63c93cff3 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -291,9 +291,8 @@ struct dispatch_to_flatbuf { template std::enable_if_t(), void> operator()() { - // TODO: Implementing ``dictionary32`` would need ``DictionaryFieldMapper`` and - // ``FieldPosition`` classes from arrow source to keep track of dictionary encoding paths. - CUDF_FAIL("Dictionary columns are not supported for writing arrow schema"); + // `dictionary32` columns are not written to parquet by cudf. 
+ CUDF_FAIL("Dictionary columns are not supported for writing"); } }; From 578c8e1c89cee46741ea359172b37a45200ea1c8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 14 Jun 2024 20:06:00 +0000 Subject: [PATCH 37/45] minor comment update --- cpp/src/io/parquet/writer_impl_helpers.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 1e461e9b4bc..2f8125f3c44 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -79,8 +79,10 @@ void fill_table_meta(std::unique_ptr const& table_meta); [[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); /** - * @brief Returns ``true`` if the column is nullable or if the write mode is not - * set to write the table all at once instead of chunked + * @brief Indicates if the column should be marked as nullable in the output schema + * + * Returns `true` if the input column is nullable or if the write mode is not set to + * write the table all at once instead of chunked. 
* * @param column A view of the (linked) column * @param column_metadata Metadata of the column From b1e6b6fd51a0b2cf4f52c936e2b9613baa5e262b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 26 Jun 2024 01:53:13 +0000 Subject: [PATCH 38/45] Minor refactor --- cpp/src/io/parquet/writer_impl.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f1ac122482b..94594c83996 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2429,8 +2429,7 @@ writer::impl::impl(std::vector> sinks, _dict_policy(options.get_dictionary_policy()), _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), - _int96_timestamps(options.is_enabled_int96_timestamps() and - not options.is_enabled_write_arrow_schema()), + _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _write_arrow_schema(options.is_enabled_write_arrow_schema()), @@ -2441,9 +2440,6 @@ writer::impl::impl(std::vector> sinks, _out_sink(std::move(sinks)), _compression_statistics{options.get_compression_statistics()} { - if (options.is_enabled_int96_timestamps() and options.is_enabled_write_arrow_schema()) { - CUDF_LOG_WARN("INT96 timestamps are deprecated in arrow schema. 
Disabling INT96 timestamps."); - } if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } From c011e512cefa2362b7e30617afc06f9008c99a69 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 27 Jun 2024 21:19:02 +0000 Subject: [PATCH 39/45] Incorporating minor suggestions from review --- cpp/src/io/parquet/arrow_schema_writer.cpp | 3 ++- cpp/src/io/parquet/writer_impl_helpers.cpp | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index 5f63c93cff3..ddf65e9020f 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -345,8 +345,9 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con // Instantiate a flatbuffer builder FlatBufferBuilder fbb; - // Create an empty field offset vector + // Create an empty field offset vector and reserve space for linked columns std::vector field_offsets; + field_offsets.reserve(linked_columns.size()); // populate field offsets (aka schema fields) std::transform(thrust::make_zip_iterator( diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 04d5730528d..fb20a5b0999 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -46,11 +46,13 @@ Compression to_parquet_compression(compression_type compression) nvcomp::compression_type to_nvcomp_compression_type(Compression codec) { - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); + switch (codec) { + case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; + 
case Compression::ZSTD: return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; + default: CUDF_FAIL("Unsupported compression type"); + } } uint32_t page_alignment(Compression codec) From b6a54ecae453de986e3b5d8abbcac104569ea8a2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:54:24 +0000 Subject: [PATCH 40/45] Test for exception handling to_parquet with int96 and arrow schema enabled --- python/cudf/cudf/tests/test_parquet.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 1010823d643..2db9800fd86 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1617,12 +1617,10 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - # TODO: store_schema must be false when working with INT96 timestamps gdf.to_parquet( gdf_fname.strpath, index=None, int96_timestamps=True, - store_schema=False, ) assert os.path.exists(gdf_fname) @@ -3627,6 +3625,24 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): assert_eq(expected, got3) +def test_parquet_writer_int96_timestamps_and_arrow_schema(index): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps and arrow_schema + # enabled should throw an exception. 
+ with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + @pytest.mark.parametrize( "data", [ From bddfabe7f3de79d596727cd95f1e65f43d5610a4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 5 Jul 2024 18:45:54 +0000 Subject: [PATCH 41/45] Minor fix for failing pytests --- python/cudf/cudf/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 2db9800fd86..ff0c9040737 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3625,7 +3625,7 @@ def test_parquet_writer_roundtrip_with_arrow_schema(index): assert_eq(expected, got3) -def test_parquet_writer_int96_timestamps_and_arrow_schema(index): +def test_parquet_writer_int96_timestamps_and_arrow_schema(): df = cudf.DataFrame( { "timestamp": cudf.Series( From e9ab52f0f1888fa0011b38616a9d589fa701cbaf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 01:28:02 +0000 Subject: [PATCH 42/45] Minor changes from reviewer suggestions --- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/parquet/writer_impl_helpers.cpp | 6 +++--- cpp/src/io/parquet/writer_impl_helpers.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 94594c83996..66b4fce16fe 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2474,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector co CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = std::make_unique(input); } - fill_table_meta(_table_meta); + fill_table_meta(*_table_meta); // All kinds of memory allocation and data compressions/encoding are performed here. 
// If any error occurs, such as out-of-memory exception, the internal state of the current diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index fb20a5b0999..e2f09f872d3 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -72,7 +72,7 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -void fill_table_meta(std::unique_ptr const& table_meta) +void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta std::function add_default_name = @@ -82,8 +82,8 @@ void fill_table_meta(std::unique_ptr const& table_meta) add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); } }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) { + add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i)); } } diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 73d302ec3fc..a85411594e9 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -67,7 +67,7 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block * * @param table_meta The table metadata to fill */ -void fill_table_meta(std::unique_ptr const& table_meta); +void fill_table_meta(table_input_metadata& table_meta); /** * @brief Compute size (in bytes) of the data stored in the given column. 
From 9b163f791f2e1dae0833eed63f372d4e2bedb9cc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:22:22 -0700 Subject: [PATCH 43/45] Update cpp/src/io/parquet/arrow_schema_writer.hpp Co-authored-by: Bradley Dice --- cpp/src/io/parquet/arrow_schema_writer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 1b62ef35c86..bc901ded9af 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -48,7 +48,7 @@ namespace cudf::io::parquet::detail { std::string construct_arrow_schema_ipc_message( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata const& metadata, - ::cudf::io::detail::single_write_mode const write_mode, + cudf::io::detail::single_write_mode const write_mode, bool const utc_timestamps); } // namespace cudf::io::parquet::detail From 13a06acbc6dd8e4dd8192752c1382b5aada506bc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:25:47 +0000 Subject: [PATCH 44/45] Apply clang-format --- cpp/src/io/parquet/arrow_schema_writer.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index bc901ded9af..9bc435bf6c8 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -45,10 +45,9 @@ namespace cudf::io::parquet::detail { * * @return The constructed arrow ipc message string */ -std::string construct_arrow_schema_ipc_message( - cudf::detail::LinkedColVector const& linked_columns, - table_input_metadata const& metadata, - cudf::io::detail::single_write_mode const write_mode, - bool const utc_timestamps); +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& 
linked_columns, + table_input_metadata const& metadata, + cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); } // namespace cudf::io::parquet::detail From 1ceca42b7f4d16fa6b5550594469873c8abba18d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 9 Jul 2024 22:06:56 +0000 Subject: [PATCH 45/45] Add details to `store_schema` docstring --- python/cudf/cudf/utils/ioutils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5d115c6be5a..76c7f2bfdb8 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -323,8 +323,11 @@ If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. + If ``True``, writes arrow schema to Parquet file footer's key-value + metadata section to faithfully round-trip ``duration`` types with arrow. + This cannot be used with ``int96_timestamps`` enabled as int96 timestamps + are deprecated in arrow. Also, all decimal32 and decimal64 columns will be + converted to decimal128 as arrow only supports decimal128 and decimal256 types. **kwargs Additional parameters will be passed to execution engines other than ``cudf``.