Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.12' into shm-groupby
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel committed Sep 26, 2024
2 parents 2d42b9b + 6b3d57d commit b2fb181
Show file tree
Hide file tree
Showing 66 changed files with 2,405 additions and 573 deletions.
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.3dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
- ptxcompiler
- pyarrow>=14.0.0,<18.0.0a0
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.3dev0
- pandoc
- polars>=1.8,<1.9
- pre-commit
- pyarrow>=14.0.0,<18.0.0a0
- pydata-sphinx-theme!=0.14.2
Expand Down
36 changes: 36 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class json_reader_options {
char _delimiter = '\n';
// Prune columns on read, selected based on the _dtypes option
bool _prune_columns = false;
// Experimental features: new column tree construction
bool _experimental = false;

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -277,6 +279,15 @@ class json_reader_options {
*/
[[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }

/**
* @brief Whether experimental features are enabled.
*
* When set to true, experimental features, such as the new column tree construction and
* UTF-8 matching of field names, will be enabled.
*
* @return true if experimental features are enabled
*/
[[nodiscard]] bool is_enabled_experimental() const { return _experimental; }

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -453,6 +464,16 @@ class json_reader_options {
*/
void enable_prune_columns(bool val) { _prune_columns = val; }

/**
* @brief Set whether to enable experimental features.
*
* When set to true, experimental features, such as the new column tree construction and
* UTF-8 matching of field names, will be enabled.
*
* @param val Boolean value to enable/disable experimental features
*/
void enable_experimental(bool val) { _experimental = val; }

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -695,6 +716,21 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether to enable experimental features.
*
* When set to true, experimental features, such as the new column tree construction and
* UTF-8 matching of field names, will be enabled.
*
* @param val Boolean value to enable/disable experimental features
* @return this for chaining
*/
json_reader_options_builder& experimental(bool val)
{
options._experimental = val;
return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
776 changes: 686 additions & 90 deletions cpp/src/io/json/host_tree_algorithms.cu

Large diffs are not rendered by default.

46 changes: 32 additions & 14 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ void print_tree(host_span<SymbolT const> input,
* max row offsets of columns
*/
std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
reduce_to_column_tree(tree_meta_t& tree,
reduce_to_column_tree(tree_meta_t const& tree,
device_span<NodeIndexT const> original_col_ids,
device_span<NodeIndexT const> sorted_col_ids,
device_span<NodeIndexT const> ordered_node_ids,
Expand Down Expand Up @@ -317,14 +317,21 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// Note: json_col modified here, moves this memory
};

auto get_child_schema = [schema](auto child_name) -> std::optional<schema_element> {
auto get_child_schema = [&schema](auto child_name) -> std::optional<schema_element> {
if (schema.has_value()) {
auto const result = schema.value().child_types.find(child_name);
if (result != std::end(schema.value().child_types)) { return result->second; }
}
return {};
};

// Returns the first entry of `schema`'s child_types, or an empty optional when no schema
// was provided or the schema has no child types.
auto get_list_child_schema = [&schema]() -> std::optional<schema_element> {
  // No schema at all: nothing to hand to the child column.
  if (!schema.has_value()) { return {}; }
  auto const& children = schema.value().child_types;
  if (children.size() > 0) { return children.begin()->second; }
  return {};
};

switch (json_col.type) {
case json_col_t::StringColumn: {
// move string_offsets to GPU and transform to string column
Expand Down Expand Up @@ -439,9 +446,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
rmm::device_buffer{},
0);
// Create children column
auto child_schema_element = json_col.child_columns.empty()
? std::optional<schema_element>{}
: get_child_schema(json_col.child_columns.begin()->first);
auto child_schema_element =
json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
auto [child_column, names] =
json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
? std::pair<std::unique_ptr<column>,
Expand Down Expand Up @@ -479,6 +485,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
}
}

/**
 * @brief Forwards a call to either the experimental or the default
 * `make_device_json_column` implementation.
 *
 * @tparam Args Types of the forwarded arguments
 * @param experimental If true, `experimental::make_device_json_column` is called;
 * otherwise `make_device_json_column` is called
 * @param args Arguments perfectly forwarded to the selected implementation
 * @return Whatever the selected implementation returns
 */
template <typename... Args>
auto make_device_json_column_dispatch(bool experimental, Args&&... args)
{
  // Default path: taken whenever the experimental flag is not set.
  if (!experimental) { return make_device_json_column(std::forward<Args>(args)...); }
  // `experimental::` refers to the scope named `experimental`, not the bool parameter:
  // variables are ignored when looking up the name that precedes `::`.
  return experimental::make_device_json_column(std::forward<Args>(args)...);
}

table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -524,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
gpu_tree,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_experimental(),
stream,
cudf::get_current_device_resource_ref());

Expand All @@ -536,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
0);

// Get internal JSON column
make_device_json_column(d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);
make_device_json_column_dispatch(options.is_enabled_experimental(),
d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);

// data_root refers to the root column of the data represented by the given JSON string
auto& data_root =
Expand Down
Loading

0 comments on commit b2fb181

Please sign in to comment.