-
Notifications
You must be signed in to change notification settings - Fork 912
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduce benchmark suite for JSON reader options (#15124)
The goal of this piece of work is to analyze the performance of the reader for JSON lines. This PR establishes a baseline for the performance of single quote normalization, white space normalization, mixed type as string parsing and recovery mode options when the input JSON is valid, and does not have any single quotes. Modifying the data generation to produce inputs with single quotes/mixed types/invalid lines will be the focus of follow-on PRs. Addresses #15041 Authors: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: #15124
- Loading branch information
Showing
3 changed files
with
264 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <benchmarks/common/generate_input.hpp> | ||
#include <benchmarks/fixture/benchmark_fixture.hpp> | ||
#include <benchmarks/io/cuio_common.hpp> | ||
#include <benchmarks/io/nvbench_helpers.hpp> | ||
|
||
#include <cudf/detail/utilities/integer_utils.hpp> | ||
#include <cudf/io/json.hpp> | ||
#include <cudf/utilities/default_stream.hpp> | ||
|
||
#include <nvbench/nvbench.cuh> | ||
|
||
// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to | ||
// run on most GPUs, but large enough to allow highest throughput | ||
constexpr size_t data_size = 512 << 20; | ||
constexpr cudf::size_type num_cols = 64; | ||
|
||
template <json_lines JsonLines> | ||
void BM_json_read_options(nvbench::state& state, nvbench::type_list<nvbench::enum_type<JsonLines>>) | ||
{ | ||
constexpr auto json_lines_bool = JsonLines == json_lines::YES; | ||
|
||
cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); | ||
auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL), | ||
static_cast<int32_t>(data_type::FLOAT), | ||
static_cast<int32_t>(data_type::DECIMAL), | ||
static_cast<int32_t>(data_type::STRING), | ||
static_cast<int32_t>(data_type::LIST), | ||
static_cast<int32_t>(data_type::STRUCT)}); | ||
|
||
auto const tbl = create_random_table( | ||
cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder()); | ||
auto const view = tbl->view(); | ||
cudf::io::json_writer_options const write_opts = | ||
cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) | ||
.lines(json_lines_bool) | ||
.na_rep("null") | ||
.rows_per_chunk(100'000); | ||
cudf::io::write_json(write_opts); | ||
|
||
cudf::io::json_reader_options read_options = | ||
cudf::io::json_reader_options::builder(source_sink.make_source_info()).lines(json_lines_bool); | ||
|
||
auto mem_stats_logger = cudf::memory_stats_logger(); | ||
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); | ||
state.exec( | ||
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { | ||
try_drop_l3_cache(); | ||
timer.start(); | ||
auto const result = cudf::io::read_json(read_options); | ||
auto const num_rows_read = result.tbl->num_rows(); | ||
auto const num_cols_read = result.tbl->num_columns(); | ||
timer.stop(); | ||
CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); | ||
CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns"); | ||
}); | ||
|
||
auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); | ||
auto const data_processed = data_size * num_cols / view.num_columns(); | ||
state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second"); | ||
state.add_buffer_size( | ||
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); | ||
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); | ||
} | ||
|
||
template <row_selection RowSelection, | ||
normalize_single_quotes NormalizeSingleQuotes, | ||
normalize_whitespace NormalizeWhitespace, | ||
mixed_types_as_string MixedTypesAsString, | ||
recovery_mode RecoveryMode> | ||
void BM_jsonlines_read_options(nvbench::state& state, | ||
nvbench::type_list<nvbench::enum_type<RowSelection>, | ||
nvbench::enum_type<NormalizeSingleQuotes>, | ||
nvbench::enum_type<NormalizeWhitespace>, | ||
nvbench::enum_type<MixedTypesAsString>, | ||
nvbench::enum_type<RecoveryMode>>) | ||
{ | ||
constexpr auto normalize_single_quotes_bool = | ||
NormalizeSingleQuotes == normalize_single_quotes::YES; | ||
constexpr auto normalize_whitespace_bool = NormalizeWhitespace == normalize_whitespace::YES; | ||
constexpr auto mixed_types_as_string_bool = MixedTypesAsString == mixed_types_as_string::YES; | ||
constexpr auto recovery_mode_enum = RecoveryMode == recovery_mode::RECOVER_WITH_NULL | ||
? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL | ||
: cudf::io::json_recovery_mode_t::FAIL; | ||
size_t const num_chunks = state.get_int64("num_chunks"); | ||
if (num_chunks > 1 && RowSelection == row_selection::ALL) { | ||
state.skip( | ||
"No point running the same benchmark multiple times for different num_chunks when all rows " | ||
"are being selected anyway"); | ||
return; | ||
} | ||
|
||
cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); | ||
auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL), | ||
static_cast<int32_t>(data_type::FLOAT), | ||
static_cast<int32_t>(data_type::DECIMAL), | ||
static_cast<int32_t>(data_type::STRING), | ||
static_cast<int32_t>(data_type::LIST), | ||
static_cast<int32_t>(data_type::STRUCT)}); | ||
|
||
auto const tbl = create_random_table( | ||
cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder()); | ||
auto const view = tbl->view(); | ||
cudf::io::json_writer_options const write_opts = | ||
cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) | ||
.lines(true) | ||
.na_rep("null") | ||
.rows_per_chunk(100'000); | ||
cudf::io::write_json(write_opts); | ||
|
||
cudf::io::json_reader_options read_options = | ||
cudf::io::json_reader_options::builder(source_sink.make_source_info()) | ||
.lines(true) | ||
.normalize_single_quotes(normalize_single_quotes_bool) | ||
.normalize_whitespace(normalize_whitespace_bool) | ||
.mixed_types_as_string(mixed_types_as_string_bool) | ||
.recovery_mode(recovery_mode_enum); | ||
|
||
size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks); | ||
auto mem_stats_logger = cudf::memory_stats_logger(); | ||
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); | ||
state.exec( | ||
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { | ||
try_drop_l3_cache(); | ||
cudf::size_type num_rows_read = 0; | ||
cudf::size_type num_cols_read = 0; | ||
timer.start(); | ||
switch (RowSelection) { | ||
case row_selection::ALL: { | ||
auto const result = cudf::io::read_json(read_options); | ||
num_rows_read = result.tbl->num_rows(); | ||
num_cols_read = result.tbl->num_columns(); | ||
break; | ||
} | ||
case row_selection::BYTE_RANGE: { | ||
for (uint64_t chunk = 0; chunk < num_chunks; chunk++) { | ||
read_options.set_byte_range_offset(chunk * chunk_size); | ||
read_options.set_byte_range_size(chunk_size); | ||
auto const result = cudf::io::read_json(read_options); | ||
num_rows_read += result.tbl->num_rows(); | ||
num_cols_read = result.tbl->num_columns(); | ||
if (num_cols_read) | ||
CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns"); | ||
} | ||
break; | ||
} | ||
default: CUDF_FAIL("Unsupported row selection method"); | ||
} | ||
timer.stop(); | ||
CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); | ||
}); | ||
|
||
auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); | ||
auto const data_processed = data_size * num_cols / view.num_columns(); | ||
state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second"); | ||
state.add_buffer_size( | ||
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); | ||
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES( | ||
BM_jsonlines_read_options, | ||
NVBENCH_TYPE_AXES( | ||
nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>, | ||
nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES>, | ||
nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>, | ||
nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>, | ||
nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>)) | ||
.set_name("jsonlines_reader") | ||
.set_type_axes_names({"row_selection", | ||
"normalize_single_quotes", | ||
"normalize_whitespace", | ||
"mixed_types_as_string", | ||
"recovery_mode"}) | ||
.set_min_samples(6) | ||
.add_int64_axis("num_chunks", nvbench::range(1, 5, 1)); | ||
|
||
NVBENCH_BENCH_TYPES(BM_json_read_options, | ||
NVBENCH_TYPE_AXES(nvbench::enum_type_list<json_lines::YES, json_lines::NO>)) | ||
.set_name("json_reader") | ||
.set_type_axes_names({"json_lines"}) | ||
.set_min_samples(6); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters