diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a6112b8db4c..7374ffc37e6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -101,6 +101,8 @@ class json_reader_options { bool _lines = false; // Parse mixed types as a string column bool _mixed_types_as_string = false; + // Prune columns on read, selected based on the _dtypes option + bool _prune_columns = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -241,6 +243,17 @@ class json_reader_options { */ bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** + * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @return True if column pruning is enabled + */ + bool is_enabled_prune_columns() const { return _prune_columns; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -342,6 +355,17 @@ class json_reader_options { */ void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + */ + void enable_prune_columns(bool val) { _prune_columns = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -508,6 +532,22 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref dtypes option. + * + * When set as true, if the reader options include @ref dtypes, then + * the reader will only return those columns which are mentioned in @ref dtypes. + * If false, then all columns are returned, independent of the @ref dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + * @return this for chaining + */ + json_reader_options_builder& prune_columns(bool val) + { + options._prune_columns = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. 
* diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7117af8948b..631f8adbd6d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -564,7 +564,7 @@ void make_device_json_column(device_span input, } }; auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy(stream), v.begin(), v.end(), 0); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); }; auto initialize_json_columns = [&](auto i, auto& col) { @@ -625,13 +625,14 @@ void make_device_json_column(device_span input, // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { @@ -647,11 +648,46 @@ void make_device_json_column(device_span input, } else { CUDF_FAIL("Unexpected parent column category"); } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); - if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { - // if parent is mixed type column, ignore this column. - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + // if parent is mixed type column or this column is pruned, ignore this column. 
+ if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } continue; } @@ -714,12 +750,13 @@ void make_device_json_column(device_span input, "A mix of lists and structs within the same column is not supported"); } } + if (is_enabled_mixed_types_as_string) { // get path of this column, check if it is a struct forced as string, and enforce it - auto nt = tree_path.get_path(this_col_id); - std::optional user_dt = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and - user_dt.value().id() == type_id::STRING) { + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and + user_dtype.value().id() == type_id::STRING) { is_mixed_type_column[this_col_id] = 1; column_categories[this_col_id] = NC_STR; } @@ -873,25 +910,27 @@ void make_device_json_column(device_span input, for (auto& [id, col_ref] : columns) { auto& col = col_ref.get(); if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.string_offsets.begin(), col.string_offsets.end(), col.string_offsets.begin(), thrust::maximum{}); } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), col.child_offsets.begin(), thrust::maximum{}); } } + stream.synchronize(); } std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, cudf::io::parse_options const& options, + bool prune_columns, std::optional schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -982,13 +1021,16 @@ std::pair, std::vector> device_json_co for (auto const& col_name : json_col.column_order) { auto const& col = json_col.child_columns.find(col_name); column_names.emplace_back(col->first); - auto& child_col = col->second; - auto [child_column, names] = device_json_column_to_cudf_column( - child_col, d_input, options, get_child_schema(col_name), stream, mr); - CUDF_EXPECTS(num_rows == child_column->size(), - "All children columns must have the same size"); - child_columns.push_back(std::move(child_column)); - column_names.back().children = names; + auto& child_col = col->second; + auto child_schema_element = get_child_schema(col_name); + if (!prune_columns or child_schema_element.has_value()) { + auto [child_column, names] = device_json_column_to_cudf_column( + child_col, d_input, options, prune_columns, child_schema_element, stream, mr); + CUDF_EXPECTS(num_rows == child_column->size(), + "All children columns must have the same size"); + child_columns.push_back(std::move(child_column)); + column_names.back().children = names; + } } auto [result_bitmask, null_count] = make_validity(json_col); // The null_mask is set after creation of struct column is to skip the superimpose_nulls and @@ -1011,8 +1053,11 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0); // Create children column + auto child_schema_element = json_col.child_columns.empty() + ? 
std::optional{} + : get_child_schema(json_col.child_columns.begin()->first); auto [child_column, names] = - json_col.child_columns.empty() + json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) ? std::pair, // EMPTY type could not used because gather throws exception on EMPTY type. std::vector>{std::make_unique( @@ -1022,13 +1067,13 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0), std::vector{}} - : device_json_column_to_cudf_column( - json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + : device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + prune_columns, + child_schema_element, + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); auto ret_col = make_lists_column(num_rows, @@ -1140,8 +1185,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, size_type column_index = 0; for (auto const& col_name : root_struct_col.column_order) { auto& json_col = root_struct_col.child_columns.find(col_name)->second; - // Insert this columns name into the schema - out_column_names.emplace_back(col_name); std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ @@ -1184,18 +1227,28 @@ table_with_metadata device_parse_nested_json(device_span d_input, debug_schema_print(child_schema_element); #endif - // Get this JSON column's cudf column and schema info, (modifies json_col) - auto [cudf_col, col_name_info] = device_json_column_to_cudf_column( - json_col, d_input, parse_opt, child_schema_element, stream, mr); - // TODO: RangeIndex as DataFrame.columns names for array of arrays - // if (is_array_of_arrays) { - // col_name_info.back().name = ""; - // } - - out_column_names.back().children = std::move(col_name_info); - out_columns.emplace_back(std::move(cudf_col)); - - column_index++; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { + // Get this JSON column's cudf column and schema info, (modifies json_col) + auto [cudf_col, col_name_info] = + device_json_column_to_cudf_column(json_col, + d_input, + parse_opt, + options.is_enabled_prune_columns(), + child_schema_element, + stream, + mr); + // Insert this column's name into the schema + out_column_names.emplace_back(col_name); + // TODO: RangeIndex as DataFrame.columns names for array of arrays + // if (is_array_of_arrays) { + // col_name_info.back().name = ""; + // } + + out_column_names.back().children = std::move(col_name_info); + out_columns.emplace_back(std::move(cudf_col)); + + column_index++; + } } return table_with_metadata{std::make_unique(std::move(out_columns)), {out_column_names}}; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index a302785cee8..52ea23c7f1c 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -319,7 +319,7 @@ table_with_metadata device_parse_nested_json(device_span input, * @return data type of the column if present */ std::optional get_path_data_type( - host_span> path, + host_span const> path, cudf::io::json_reader_options const& options); /** diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 740b7523cc1..4caa5cd9e24 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -58,8 +58,15 @@ std::optional child_schema_element(std::string const& col_name, // "a": [ null] {"a", 
list}, {"element", str} // back() is root. // front() is leaf. +/** + * @brief Get the path data type of a column by path if present in input schema + * + * @param path path of the json column + * @param root root of input schema element + * @return data type of the column if present, otherwise std::nullopt + */ std::optional get_path_data_type( - host_span> path, schema_element const& root) + host_span const> path, schema_element const& root) { if (path.empty() || path.size() == 1) { return root.type; @@ -81,7 +88,7 @@ std::optional get_path_data_type( } std::optional get_path_data_type( - host_span> path, + host_span const> path, cudf::io::json_reader_options const& options) { if (path.empty()) return {}; @@ -98,11 +105,11 @@ std::optional get_path_data_type( std::vector path_from_tree::get_path(NodeIndexT this_col_id) { std::vector path; - // TODO Need to stop at row root. so, how to find row root? + // stops at root. while (this_col_id != parent_node_sentinel) { auto type = column_categories[this_col_id]; std::string name = ""; - // TODO make this ifelse into a separate lambda function, along with parent_col_id. + // code same as name_and_parent_index lambda. auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index f0f72d4e794..b25822f6613 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2233,9 +2233,6 @@ TEST_F(JsonReaderTest, MixedTypes) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - static int num_case = 0; - num_case++; - std::cout << "case:" << num_case << "\n"; CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); }; // value + string (not mixed type case) @@ -2437,4 +2434,206 @@ TEST_F(JsonReaderTest, MapTypes) {type_id::LIST, type_id::STRING, type_id::STRING}); } +// Test case for dtype prune: +// all paths, only one. 
+// one present, another not present, nothing present +// nested, flat, not-jsonlines +TEST_F(JsonReaderTest, JsonNestedDtypeFilter) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema + { + std::map dtype_schema{ + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(1).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(1).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// vector + { + std::vector types{ + {dtype()}, data_type{cudf::type_id::STRUCT}, {dtype()}}; + in_options.set_dtypes(types); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + } + //// map + { + std::map dtype_map{ + {"b", + { + data_type{cudf::type_id::STRUCT}, + }}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_map); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + 
EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + } + + // include only one column + //// schema + { + std::map dtype_schema{ + {"a", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + //// vector + { + std::vector types{{dtype()}}; + in_options.set_dtypes(types); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + //// map + { + std::map dtype_map{ + {"a", {dtype()}}, + }; + in_options.set_dtypes(dtype_map); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + + // include only one column (nested) + { + std::map dtype_schema{ + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "b":"1":[float] + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + std::map dtype_schema{ + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + std::map dtype_schema{ + {"a", {dtype()}}, + {"d", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + // multiple - not all present nested + { + std::map dtype_schema{ + + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = 
cudf::io::read_json(in_options); + // Make sure we have columns "b" (empty struct) and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 0); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index b916c2b7ad9..1e1057beede 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -28,6 +28,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_size() except + bool is_enabled_lines() except + bool is_enabled_mixed_types_as_string() except + + bool is_enabled_prune_columns() except + bool is_enabled_dayfirst() except + bool is_enabled_experimental() except + @@ -41,6 +42,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_size(size_type size) except + void enable_lines(bool val) except + void enable_mixed_types_as_string(bool val) except + + void enable_prune_columns(bool val) except + void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + @@ -79,6 +81,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& mixed_types_as_string( bool val ) except + + json_reader_options_builder& prune_columns( + bool val + ) except + json_reader_options_builder& dayfirst( bool val ) except + diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index f2e03391f08..cef71ed24a5 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -49,7 +49,8 @@ cpdef read_json(object filepaths_or_buffers, object byte_range, bool legacy, bool keep_quotes, - bool mixed_types_as_string): + bool mixed_types_as_string, + bool prune_columns): """ Cython function to call into libcudf API, see `read_json`. @@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 5ef25a99590..03d07fc3a50 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -26,6 +26,7 @@ def read_json( keep_quotes=False, storage_options=None, mixed_types_as_string=False, + prune_columns=False, *args, **kwargs, ): @@ -101,6 +102,7 @@ def read_json( False, keep_quotes, mixed_types_as_string, + prune_columns, ) else: warnings.warn( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 66e14f4b9de..6bd7558d322 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -692,7 +692,6 @@ This parameter is only supported with ``engine='cudf'``. - This parameter is only supported in ``cudf`` engine. If `True`, any string values are read literally (and wrapped in an additional set of quotes). If `False` string values are parsed into Python strings. @@ -703,7 +702,22 @@ For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details. +mixed_types_as_string : bool, default False + .. admonition:: GPU-accelerated feature + + This parameter is only supported with ``engine='cudf'``. 
+ + If True, mixed type columns are returned as string columns. + If `False`, parsing mixed type columns will throw an error. +prune_columns : bool, default False + + .. admonition:: GPU-accelerated feature + + This parameter is only supported with ``engine='cudf'``. + + If True, only return those columns mentioned in the dtype argument. + If `False`, the dtype argument is used as a type inference suggestion. Returns ------- result : Series or DataFrame, depending on the value of `typ`.
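Example usage of the new option (not part of the patch above): a minimal Python sketch of reading JSON Lines with column pruning enabled, assuming the `prune_columns` keyword is exposed through `cudf.read_json` with `engine='cudf'` as wired up in this change; the sample data, column names, and expected output are illustrative only.

    import io

    import cudf

    # Hypothetical JSON Lines input: column "b" appears in the data but is not
    # listed in the dtype mapping below.
    data = io.StringIO(
        '{"a": 1, "b": {"0": "abc"}, "c": true}\n'
        '{"a": 2, "c": false}\n'
    )

    # With prune_columns=True, only the columns named in `dtype` are parsed and
    # returned; "b" is dropped instead of being type-inferred.
    df = cudf.read_json(
        data,
        engine="cudf",
        lines=True,
        dtype={"a": "int32", "c": "bool"},
        prune_columns=True,
    )

    print(df.columns)  # expected: Index(['a', 'c'], dtype='object')

With the default `prune_columns=False`, the same call would also return column "b" with its type inferred, matching the pre-existing behavior.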