Skip to content

Commit

Permalink
Merge branch 'plc/io/parquet' of github.com:Matt711/cudf into plc/io/…
Browse files Browse the repository at this point in the history
…parquet
  • Loading branch information
Matt711 committed Dec 3, 2024
2 parents bcbe608 + 994abc5 commit 2c0e1bd
Show file tree
Hide file tree
Showing 53 changed files with 1,175 additions and 865 deletions.
2 changes: 1 addition & 1 deletion ci/cpp_linters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ source rapids-configure-sccache
# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.

iwyu_flag=""
if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then
iwyu_flag="-DCUDF_IWYU=ON"
fi
cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja
Expand Down
14 changes: 11 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ option(
mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF)
option(CUDF_IWYU "Enable IWYU during compilation" OFF)
option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF)

option(
CUDF_KVIKIO_REMOTE_IO
Expand Down Expand Up @@ -205,9 +206,16 @@ function(enable_static_checkers target)
if(_LINT_CLANG_TIDY)
# clang will complain about unused link libraries on the compile line unless we specify
# -Qunused-arguments.
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
)
if(CUDF_CLANG_TIDY_AUTOFIX)
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY
"${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix"
)
else()
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
)
endif()
endif()
if(_LINT_IWYU)
# A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/bitmask/is_element_valid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ bool is_element_valid_sync(column_view const& col_view,
CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index.");
if (!col_view.nullable()) { return true; }

bitmask_type word;
bitmask_type word = 0;
// null_mask() returns device ptr to bitmask without offset
size_type index = element_index + col_view.offset();
size_type const index = element_index + col_view.offset();
CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
col_view.null_mask() + word_index(index),
sizeof(bitmask_type),
Expand Down
97 changes: 49 additions & 48 deletions cpp/src/column/column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k
cudf::experimental::prefetch::detail::prefetch_noexcept(
key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
} else if (col.type().id() == type_id::STRING) {
strings_column_view scv{col};
strings_column_view const scv{col};
if (data_ptr == nullptr) {
// Do not call chars_size if the data_ptr is nullptr.
return;
Expand All @@ -58,51 +58,6 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k
}
}

} // namespace

column_view_base::column_view_base(data_type type,
size_type size,
void const* data,
bitmask_type const* null_mask,
size_type null_count,
size_type offset)
: _type{type},
_size{size},
_data{data},
_null_mask{null_mask},
_null_count{null_count},
_offset{offset}
{
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

if (type.id() == type_id::EMPTY) {
_null_count = size;
CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data.");
CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask.");
} else if (is_compound(type)) {
if (type.id() != type_id::STRING) {
CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data");
}
} else if (size > 0) {
CUDF_EXPECTS(nullptr != data, "Null data pointer.");
}

CUDF_EXPECTS(offset >= 0, "Invalid offset.");

if ((null_count > 0) and (type.id() != type_id::EMPTY)) {
CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count.");
}
}

size_type column_view_base::null_count(size_type begin, size_type end) const
{
CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds.");
return (null_count() == 0)
? 0
: cudf::detail::null_count(
null_mask(), offset() + begin, offset() + end, cudf::get_default_stream());
}

// Struct to use custom hash combine and fold expression
struct HashValue {
std::size_t hash;
Expand Down Expand Up @@ -133,8 +88,6 @@ std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false
});
}

std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); }

bool shallow_equivalent_impl(column_view const& lhs,
column_view const& rhs,
bool is_parent_empty = false)
Expand All @@ -151,11 +104,59 @@ bool shallow_equivalent_impl(column_view const& lhs,
return shallow_equivalent_impl(lhs_child, rhs_child, is_empty);
});
}

} // namespace

column_view_base::column_view_base(data_type type,
size_type size,
void const* data,
bitmask_type const* null_mask,
size_type null_count,
size_type offset)
: _type{type},
_size{size},
_data{data},
_null_mask{null_mask},
_null_count{null_count},
_offset{offset}
{
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

if (type.id() == type_id::EMPTY) {
_null_count = size;
CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data.");
CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask.");
} else if (is_compound(type)) {
if (type.id() != type_id::STRING) {
CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data");
}
} else if (size > 0) {
CUDF_EXPECTS(nullptr != data, "Null data pointer.");
}

CUDF_EXPECTS(offset >= 0, "Invalid offset.");

if ((null_count > 0) and (type.id() != type_id::EMPTY)) {
CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count.");
}
}

size_type column_view_base::null_count(size_type begin, size_type end) const
{
CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds.");
return (null_count() == 0)
? 0
: cudf::detail::null_count(
null_mask(), offset() + begin, offset() + end, cudf::get_default_stream());
}

bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
{
return shallow_equivalent_impl(lhs, rhs);
}

std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); }

} // namespace detail

// Immutable view constructor
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/copying/copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,12 @@ struct scalar_empty_like_functor_impl<cudf::list_view> {
auto ls = static_cast<list_scalar const*>(&input);

// TODO: add a manual constructor for lists_column_view.
column_view offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
column_view const offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
std::vector<column_view> children;
children.push_back(offsets);
children.push_back(ls->view());
column_view lcv{cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};
column_view const lcv{
cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};

return empty_like(lcv);
}
Expand All @@ -81,8 +82,9 @@ struct scalar_empty_like_functor_impl<cudf::struct_view> {
// TODO: add a manual constructor for structs_column_view
// TODO: add cudf::get_element() support for structs
cudf::table_view tbl = ss->view();
std::vector<column_view> children(tbl.begin(), tbl.end());
column_view scv{cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};
std::vector<column_view> const children(tbl.begin(), tbl.end());
column_view const scv{
cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};

return empty_like(scv);
}
Expand Down Expand Up @@ -120,7 +122,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
CUDF_FUNC_RANGE();
CUDF_EXPECTS(
is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
mask_state const allocate_mask = should_allocate_mask(mask_alloc, input.nullable());

return std::make_unique<column>(input.type(),
size,
Expand Down
81 changes: 36 additions & 45 deletions cpp/src/copying/pack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,20 +48,20 @@ struct serialized_column {
null_count(_null_count),
data_offset(_data_offset),
null_mask_offset(_null_mask_offset),
num_children(_num_children),
pad(0)
num_children(_num_children)

{
}

data_type type;
size_type size;
size_type null_count;
int64_t data_offset; // offset into contiguous data buffer, or -1 if column data is null
int64_t null_mask_offset; // offset into contiguous data buffer, or -1 if column data is null
size_type num_children;
size_type size{};
size_type null_count{};
int64_t data_offset{}; // offset into contiguous data buffer, or -1 if column data is null
int64_t null_mask_offset{}; // offset into contiguous data buffer, or -1 if column data is null
size_type num_children{};
// Explicitly pad to avoid uninitialized padding bits, allowing `serialized_column` to be bit-wise
// comparable
int pad;
int pad{};
};

/**
Expand Down Expand Up @@ -137,6 +137,34 @@ void build_column_metadata(metadata_builder& mb,
});
}

table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
{
// gpu data can be null if everything is empty but the metadata must always be valid
CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
uint8_t const* base_ptr = gpu_data;
// first entry is a stub where size == the total # of top level columns (see pack_metadata above)
auto const num_columns = serialized_columns[0].size;
size_t current_index = 1;

std::function<std::vector<column_view>(size_type)> get_columns;
get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
std::vector<column_view> cols;
for (size_t i = 0; i < num_columns; i++) {
auto serial_column = serialized_columns[current_index];
current_index++;

std::vector<column_view> const children = get_columns(serial_column.num_children);

cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
}

return cols;
};

return table_view{get_columns(num_columns)};
}

} // anonymous namespace

/**
Expand Down Expand Up @@ -198,37 +226,6 @@ class metadata_builder_impl {
std::vector<detail::serialized_column> metadata;
};

/**
* @copydoc cudf::detail::unpack
*/
table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
{
// gpu data can be null if everything is empty but the metadata must always be valid
CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
uint8_t const* base_ptr = gpu_data;
// first entry is a stub where size == the total # of top level columns (see pack_metadata above)
auto const num_columns = serialized_columns[0].size;
size_t current_index = 1;

std::function<std::vector<column_view>(size_type)> get_columns;
get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
std::vector<column_view> cols;
for (size_t i = 0; i < num_columns; i++) {
auto serial_column = serialized_columns[current_index];
current_index++;

std::vector<column_view> children = get_columns(serial_column.num_children);

cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
}

return cols;
};

return table_view{get_columns(num_columns)};
}

metadata_builder::metadata_builder(size_type const num_root_columns)
: impl(std::make_unique<metadata_builder_impl>(num_root_columns +
1 /*one more extra metadata entry as below*/))
Expand Down Expand Up @@ -280,9 +277,6 @@ std::vector<uint8_t> pack_metadata(table_view const& table,
return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder);
}

/**
* @copydoc cudf::unpack
*/
table_view unpack(packed_columns const& input)
{
CUDF_FUNC_RANGE();
Expand All @@ -292,9 +286,6 @@ table_view unpack(packed_columns const& input)
reinterpret_cast<uint8_t const*>(input.gpu_data->data()));
}

/**
* @copydoc cudf::unpack(uint8_t const*, uint8_t const* )
*/
table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
{
CUDF_FUNC_RANGE();
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/datetime/timezone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ struct dst_transition_s {
#pragma pack(pop)

struct timezone_file {
timezone_file_header header;
timezone_file_header header{};
bool is_header_from_64bit = false;

std::vector<int64_t> transition_times;
Expand Down
Loading

0 comments on commit 2c0e1bd

Please sign in to comment.