Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into pylibcudf_pack_and_unpack
Browse files Browse the repository at this point in the history
  • Loading branch information
madsbk authored Oct 10, 2024
2 parents e700db6 + 31423d0 commit 11e73b9
Show file tree
Hide file tree
Showing 47 changed files with 305 additions and 138 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/utilities/span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
std::is_convertible_v<OtherT (*)[], T (*)[]>, // NOLINT
void>* = nullptr>
constexpr host_span(host_span<OtherT, OtherExtent> const& other) noexcept
: base(other.data(), other.size())
: base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
{
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,10 @@ class parquet_field_struct : public parquet_field {
template <typename E, typename T>
class parquet_field_union_struct : public parquet_field {
E& enum_val;
cuda::std::optional<T>& val; // union structs are always wrapped in std::optional
std::optional<T>& val; // union structs are always wrapped in std::optional

public:
parquet_field_union_struct(int f, E& ev, cuda::std::optional<T>& v)
parquet_field_union_struct(int f, E& ev, std::optional<T>& v)
: parquet_field(f), enum_val(ev), val(v)
{
}
Expand Down Expand Up @@ -439,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field {
*/
template <typename T, typename FieldFunctor>
class parquet_field_optional : public parquet_field {
cuda::std::optional<T>& val;
std::optional<T>& val;

public:
parquet_field_optional(int f, cuda::std::optional<T>& v) : parquet_field(f), val(v) {}
parquet_field_optional(int f, std::optional<T>& v) : parquet_field(f), val(v) {}

inline void operator()(CompactProtocolReader* cpr, int field_type)
{
Expand Down
64 changes: 31 additions & 33 deletions cpp/src/io/parquet/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

#include <cudf/types.hpp>

#include <cuda/std/optional>

#include <cstdint>
#include <optional>
#include <string>
Expand Down Expand Up @@ -94,10 +92,10 @@ struct LogicalType {
BSON
};
Type type;
cuda::std::optional<DecimalType> decimal_type;
cuda::std::optional<TimeType> time_type;
cuda::std::optional<TimestampType> timestamp_type;
cuda::std::optional<IntType> int_type;
std::optional<DecimalType> decimal_type;
std::optional<TimeType> time_type;
std::optional<TimestampType> timestamp_type;
std::optional<IntType> int_type;

LogicalType(Type tp = UNDEFINED) : type(tp) {}
LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
Expand Down Expand Up @@ -178,21 +176,21 @@ struct SchemaElement {
// 5: nested fields
int32_t num_children = 0;
// 6: DEPRECATED: record the original type before conversion to parquet type
cuda::std::optional<ConvertedType> converted_type;
std::optional<ConvertedType> converted_type;
// 7: DEPRECATED: record the scale for DECIMAL converted type
int32_t decimal_scale = 0;
// 8: DEPRECATED: record the precision for DECIMAL converted type
int32_t decimal_precision = 0;
// 9: save field_id from original schema
cuda::std::optional<int32_t> field_id;
std::optional<int32_t> field_id;
// 10: replaces converted type
cuda::std::optional<LogicalType> logical_type;
std::optional<LogicalType> logical_type;

// extra cudf specific fields
bool output_as_byte_array = false;

// cudf type determined from arrow:schema
cuda::std::optional<type_id> arrow_type;
std::optional<type_id> arrow_type;

// The following fields are filled in later during schema initialization
int max_definition_level = 0;
Expand Down Expand Up @@ -258,21 +256,21 @@ struct SchemaElement {
*/
struct Statistics {
// deprecated max value in signed comparison order
cuda::std::optional<std::vector<uint8_t>> max;
std::optional<std::vector<uint8_t>> max;
// deprecated min value in signed comparison order
cuda::std::optional<std::vector<uint8_t>> min;
std::optional<std::vector<uint8_t>> min;
// count of null values in the column
cuda::std::optional<int64_t> null_count;
std::optional<int64_t> null_count;
// count of distinct values occurring
cuda::std::optional<int64_t> distinct_count;
std::optional<int64_t> distinct_count;
// max value for column determined by ColumnOrder
cuda::std::optional<std::vector<uint8_t>> max_value;
std::optional<std::vector<uint8_t>> max_value;
// min value for column determined by ColumnOrder
cuda::std::optional<std::vector<uint8_t>> min_value;
std::optional<std::vector<uint8_t>> min_value;
// If true, max_value is the actual maximum value for a column
cuda::std::optional<bool> is_max_value_exact;
std::optional<bool> is_max_value_exact;
// If true, min_value is the actual minimum value for a column
cuda::std::optional<bool> is_min_value_exact;
std::optional<bool> is_min_value_exact;
};

/**
Expand All @@ -281,7 +279,7 @@ struct Statistics {
struct SizeStatistics {
// Number of variable-width bytes stored for the page/chunk. Should not be set for anything
// but the BYTE_ARRAY physical type.
cuda::std::optional<int64_t> unencoded_byte_array_data_bytes;
std::optional<int64_t> unencoded_byte_array_data_bytes;
/**
* When present, there is expected to be one element corresponding to each
* repetition (i.e. size=max repetition_level+1) where each element
Expand All @@ -290,14 +288,14 @@ struct SizeStatistics {
*
* This value should not be written if max_repetition_level is 0.
*/
cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
std::optional<std::vector<int64_t>> repetition_level_histogram;

/**
* Same as repetition_level_histogram except for definition levels.
*
* This value should not be written if max_definition_level is 0 or 1.
*/
cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
std::optional<std::vector<int64_t>> definition_level_histogram;
};

/**
Expand All @@ -318,7 +316,7 @@ struct OffsetIndex {
std::vector<PageLocation> page_locations;
// per-page size info. see description of the same field in SizeStatistics. only present for
// columns with a BYTE_ARRAY physical type.
cuda::std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
};

/**
Expand All @@ -329,11 +327,11 @@ struct ColumnIndex {
std::vector<std::vector<uint8_t>> min_values; // lower bound for values in each page
std::vector<std::vector<uint8_t>> max_values; // upper bound for values in each page
BoundaryOrder boundary_order =
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
cuda::std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page
// Repetition/definition level histograms for the column chunk
cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
std::optional<std::vector<int64_t>> repetition_level_histogram;
std::optional<std::vector<int64_t>> definition_level_histogram;
};

/**
Expand Down Expand Up @@ -383,11 +381,11 @@ struct ColumnChunkMetaData {
Statistics statistics;
// Set of all encodings used for pages in this column chunk. This information can be used to
// determine if all data pages are dictionary encoded for example.
cuda::std::optional<std::vector<PageEncodingStats>> encoding_stats;
std::optional<std::vector<PageEncodingStats>> encoding_stats;
// Optional statistics to help estimate total memory when converted to in-memory representations.
// The histograms contained in these statistics can also be useful in some cases for more
// fine-grained nullability/list length filter pushdown.
cuda::std::optional<SizeStatistics> size_statistics;
std::optional<SizeStatistics> size_statistics;
};

/**
Expand Down Expand Up @@ -429,13 +427,13 @@ struct RowGroup {
int64_t num_rows = 0;
// If set, specifies a sort ordering of the rows in this RowGroup.
// The sorting columns can be a subset of all the columns.
cuda::std::optional<std::vector<SortingColumn>> sorting_columns;
std::optional<std::vector<SortingColumn>> sorting_columns;
// Byte offset from beginning of file to first page (data or dictionary) in this row group
cuda::std::optional<int64_t> file_offset;
std::optional<int64_t> file_offset;
// Total byte size of all compressed (and potentially encrypted) column data in this row group
cuda::std::optional<int64_t> total_compressed_size;
std::optional<int64_t> total_compressed_size;
// Row group ordinal in the file
cuda::std::optional<int16_t> ordinal;
std::optional<int16_t> ordinal;
};

/**
Expand All @@ -460,7 +458,7 @@ struct FileMetaData {
std::vector<RowGroup> row_groups;
std::vector<KeyValue> key_value_metadata;
std::string created_by = "";
cuda::std::optional<std::vector<ColumnOrder>> column_orders;
std::optional<std::vector<ColumnOrder>> column_orders;
};

/**
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ struct ColumnChunkDesc {
uint8_t def_level_bits_,
uint8_t rep_level_bits_,
Compression codec_,
cuda::std::optional<LogicalType> logical_type_,
std::optional<LogicalType> logical_type_,
int32_t ts_clock_rate_,
int32_t src_col_index_,
int32_t src_col_schema_,
Expand Down Expand Up @@ -441,12 +441,12 @@ struct ColumnChunkDesc {
int32_t num_data_pages{}; // number of data pages
int32_t num_dict_pages{}; // number of dictionary pages
PageInfo const* dict_page{};
string_index_pair* str_dict_index{}; // index for string dictionary
bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column
void** column_data_base{}; // base pointers of column data
void** column_string_base{}; // base pointers of column string data
Compression codec{}; // compressed codec enum
cuda::std::optional<LogicalType> logical_type{}; // logical type
string_index_pair* str_dict_index{}; // index for string dictionary
bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column
void** column_data_base{}; // base pointers of column data
void** column_string_base{}; // base pointers of column string data
Compression codec{}; // compressed codec enum
std::optional<LogicalType> logical_type{}; // logical type
int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)

int32_t src_col_index{}; // my input column index
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/predicate_pushdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ struct stats_caster {
}

void set_index(size_type index,
cuda::std::optional<std::vector<uint8_t>> const& binary_value,
std::optional<std::vector<uint8_t>> const& binary_value,
Type const type)
{
if (binary_value.has_value()) {
Expand Down Expand Up @@ -234,8 +234,8 @@ struct stats_caster {
max.set_index(stats_idx, max_value, colchunk.meta_data.type);
} else {
// Marking it null, if column present in row group
min.set_index(stats_idx, cuda::std::nullopt, {});
max.set_index(stats_idx, cuda::std::nullopt, {});
min.set_index(stats_idx, std::nullopt, {});
max.set_index(stats_idx, std::nullopt, {});
}
stats_idx++;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ namespace {
// be treated as a string. Currently the only logical type that has special handling is DECIMAL.
// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which
// for now would also be treated as a string).
inline bool is_treat_fixed_length_as_string(cuda::std::optional<LogicalType> const& logical_type)
inline bool is_treat_fixed_length_as_string(std::optional<LogicalType> const& logical_type)
{
if (!logical_type.has_value()) { return true; }
return logical_type->type != LogicalType::DECIMAL;
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/reader_impl_chunking.cu
Original file line number Diff line number Diff line change
Expand Up @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos,
*
* @return A tuple of Parquet clock rate and Parquet decimal type.
*/
[[nodiscard]] std::tuple<int32_t, cuda::std::optional<LogicalType>> conversion_info(
[[nodiscard]] std::tuple<int32_t, std::optional<LogicalType>> conversion_info(
type_id column_type_id,
type_id timestamp_type_id,
Type physical,
cuda::std::optional<LogicalType> logical_type)
std::optional<LogicalType> logical_type)
{
int32_t const clock_rate =
is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0;
Expand All @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos,
// if decimal but not outputting as float or decimal, then convert to no logical type
if (column_type_id != type_id::FLOAT64 and
not cudf::is_fixed_point(data_type{column_type_id})) {
return std::make_tuple(clock_rate, cuda::std::nullopt);
return {clock_rate, std::nullopt};
}
}

return std::make_tuple(clock_rate, std::move(logical_type));
return {clock_rate, std::move(logical_type)};
}

/**
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/reader_impl_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf;

namespace {

cuda::std::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
std::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
{
if (schema.converted_type.has_value()) {
switch (schema.converted_type.value()) {
Expand Down Expand Up @@ -66,7 +66,7 @@ cuda::std::optional<LogicalType> converted_to_logical_type(SchemaElement const&
default: return LogicalType{LogicalType::UNDEFINED};
}
}
return cuda::std::nullopt;
return std::nullopt;
}

} // namespace
Expand Down Expand Up @@ -246,7 +246,7 @@ void metadata::sanitize_schema()
struct_elem.repetition_type = REQUIRED;
struct_elem.num_children = schema_elem.num_children;
struct_elem.type = UNDEFINED_TYPE;
struct_elem.converted_type = cuda::std::nullopt;
struct_elem.converted_type = std::nullopt;

// swap children
struct_elem.children_idx = std::move(schema_elem.children_idx);
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ struct aggregate_writer_metadata {
std::vector<std::vector<uint8_t>> column_indexes;
};
std::vector<per_file_metadata> files;
cuda::std::optional<std::vector<ColumnOrder>> column_orders = cuda::std::nullopt;
std::optional<std::vector<ColumnOrder>> column_orders = std::nullopt;
};

namespace {
Expand Down Expand Up @@ -472,7 +472,7 @@ struct leaf_schema_fn {
std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
{
col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
col_schema.converted_type = cuda::std::nullopt;
col_schema.converted_type = std::nullopt;
col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
if (timestamp_is_int96) {
col_schema.ts_scale = -1000; // negative value indicates division by absolute value
Expand Down Expand Up @@ -750,7 +750,7 @@ std::vector<schema_tree_node> construct_parquet_schema_tree(
col_schema.type = Type::BYTE_ARRAY;
}

col_schema.converted_type = cuda::std::nullopt;
col_schema.converted_type = std::nullopt;
col_schema.stats_dtype = statistics_dtype::dtype_byte_array;
col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
Expand Down Expand Up @@ -2795,7 +2795,7 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata(
// See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615
for (auto& se : md.schema) {
if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) {
se.logical_type = cuda::std::nullopt;
se.logical_type = std::nullopt;
}
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/io/parquet_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2)
int32_t compare_binary(std::vector<uint8_t> const& v1,
std::vector<uint8_t> const& v2,
cudf::io::parquet::detail::Type ptype,
cuda::std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
{
auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN);
switch (ptype) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/io/parquet_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ std::pair<cudf::table, std::string> create_parquet_typed_with_stats(std::string
int32_t compare_binary(std::vector<uint8_t> const& v1,
std::vector<uint8_t> const& v2,
cudf::io::parquet::detail::Type ptype,
cuda::std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype);
std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype);

void expect_compression_stats_empty(std::shared_ptr<cudf::io::writer_compression_statistics> stats);

Expand Down
Loading

0 comments on commit 11e73b9

Please sign in to comment.