Introduce benchmark suite for JSON reader options (#15124)

The goal of this piece of work is to analyze the performance of the reader for JSON lines. This PR establishes a baseline for the performance of single quote normalization, white space normalization, mixed type as string parsing and recovery mode options when the input JSON is valid, and does not have any single quotes. Modifying the data generation to produce inputs with single quotes/mixed types/invalid lines will be the focus of follow-on PRs. Addresses #15041 Authors: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: #15124
rapidsai · Apr 9, 2024 · 1862cdc · 1862cdc
1 parent b037ddf
commit 1862cdc
Show file tree

Hide file tree

Showing 3 changed files with 264 additions and 1 deletion.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -331,6 +331,7 @@ ConfigureNVBench(
 ConfigureBench(JSON_BENCH json/json.cu)
 ConfigureNVBench(FST_NVBENCH io/fst.cu)
 ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
+ConfigureNVBench(JSON_READER_OPTION io/json/json_reader_option.cpp)
 ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
 # ##################################################################################################

diff --git a/cpp/benchmarks/io/json/json_reader_option.cpp b/cpp/benchmarks/io/json/json_reader_option.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
+// run on most GPUs, but large enough to allow highest throughput
+constexpr size_t data_size         = 512 << 20;
+constexpr cudf::size_type num_cols = 64;
+
+template <json_lines JsonLines>
+void BM_json_read_options(nvbench::state& state, nvbench::type_list<nvbench::enum_type<JsonLines>>)
+{
+  constexpr auto json_lines_bool = JsonLines == json_lines::YES;
+
+  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST),
+                                             static_cast<int32_t>(data_type::STRUCT)});
+
+  auto const tbl = create_random_table(
+    cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
+      .lines(json_lines_bool)
+      .na_rep("null")
+      .rows_per_chunk(100'000);
+  cudf::io::write_json(write_opts);
+
+  cudf::io::json_reader_options read_options =
+    cudf::io::json_reader_options::builder(source_sink.make_source_info()).lines(json_lines_bool);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      try_drop_l3_cache();
+      timer.start();
+      auto const result        = cudf::io::read_json(read_options);
+      auto const num_rows_read = result.tbl->num_rows();
+      auto const num_cols_read = result.tbl->num_columns();
+      timer.stop();
+      CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
+      CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns");
+    });
+
+  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  auto const data_processed = data_size * num_cols / view.num_columns();
+  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+template <row_selection RowSelection,
+          normalize_single_quotes NormalizeSingleQuotes,
+          normalize_whitespace NormalizeWhitespace,
+          mixed_types_as_string MixedTypesAsString,
+          recovery_mode RecoveryMode>
+void BM_jsonlines_read_options(nvbench::state& state,
+                               nvbench::type_list<nvbench::enum_type<RowSelection>,
+                                                  nvbench::enum_type<NormalizeSingleQuotes>,
+                                                  nvbench::enum_type<NormalizeWhitespace>,
+                                                  nvbench::enum_type<MixedTypesAsString>,
+                                                  nvbench::enum_type<RecoveryMode>>)
+{
+  constexpr auto normalize_single_quotes_bool =
+    NormalizeSingleQuotes == normalize_single_quotes::YES;
+  constexpr auto normalize_whitespace_bool  = NormalizeWhitespace == normalize_whitespace::YES;
+  constexpr auto mixed_types_as_string_bool = MixedTypesAsString == mixed_types_as_string::YES;
+  constexpr auto recovery_mode_enum         = RecoveryMode == recovery_mode::RECOVER_WITH_NULL
+                                                ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                                                : cudf::io::json_recovery_mode_t::FAIL;
+  size_t const num_chunks                   = state.get_int64("num_chunks");
+  if (num_chunks > 1 && RowSelection == row_selection::ALL) {
+    state.skip(
+      "No point running the same benchmark multiple times for different num_chunks when all rows "
+      "are being selected anyway");
+    return;
+  }
+
+  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST),
+                                             static_cast<int32_t>(data_type::STRUCT)});
+
+  auto const tbl = create_random_table(
+    cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
+      .lines(true)
+      .na_rep("null")
+      .rows_per_chunk(100'000);
+  cudf::io::write_json(write_opts);
+
+  cudf::io::json_reader_options read_options =
+    cudf::io::json_reader_options::builder(source_sink.make_source_info())
+      .lines(true)
+      .normalize_single_quotes(normalize_single_quotes_bool)
+      .normalize_whitespace(normalize_whitespace_bool)
+      .mixed_types_as_string(mixed_types_as_string_bool)
+      .recovery_mode(recovery_mode_enum);
+
+  size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks);
+  auto mem_stats_logger   = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      try_drop_l3_cache();
+      cudf::size_type num_rows_read = 0;
+      cudf::size_type num_cols_read = 0;
+      timer.start();
+      switch (RowSelection) {
+        case row_selection::ALL: {
+          auto const result = cudf::io::read_json(read_options);
+          num_rows_read     = result.tbl->num_rows();
+          num_cols_read     = result.tbl->num_columns();
+          break;
+        }
+        case row_selection::BYTE_RANGE: {
+          for (uint64_t chunk = 0; chunk < num_chunks; chunk++) {
+            read_options.set_byte_range_offset(chunk * chunk_size);
+            read_options.set_byte_range_size(chunk_size);
+            auto const result = cudf::io::read_json(read_options);
+            num_rows_read += result.tbl->num_rows();
+            num_cols_read = result.tbl->num_columns();
+            if (num_cols_read)
+              CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns");
+          }
+          break;
+        }
+        default: CUDF_FAIL("Unsupported row selection method");
+      }
+      timer.stop();
+      CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
+    });
+
+  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  auto const data_processed = data_size * num_cols / view.num_columns();
+  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+NVBENCH_BENCH_TYPES(
+  BM_jsonlines_read_options,
+  NVBENCH_TYPE_AXES(
+    nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>,
+    nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES>,
+    nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>,
+    nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>,
+    nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>))
+  .set_name("jsonlines_reader")
+  .set_type_axes_names({"row_selection",
+                        "normalize_single_quotes",
+                        "normalize_whitespace",
+                        "mixed_types_as_string",
+                        "recovery_mode"})
+  .set_min_samples(6)
+  .add_int64_axis("num_chunks", nvbench::range(1, 5, 1));
+
+NVBENCH_BENCH_TYPES(BM_json_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<json_lines::YES, json_lines::NO>))
+  .set_name("json_reader")
+  .set_type_axes_names({"json_lines"})
+  .set_min_samples(6);
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -169,3 +169,68 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
     }
   },
   [](auto) { return std::string{}; })
+
+enum class json_lines : bool { YES, NO };
+
+enum class normalize_single_quotes : bool { YES, NO };
+
+enum class normalize_whitespace : bool { YES, NO };
+
+enum class mixed_types_as_string : bool { YES, NO };
+
+enum class recovery_mode : bool { FAIL, RECOVER_WITH_NULL };
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  json_lines,
+  [](auto value) {
+    switch (value) {
+      case json_lines::YES: return "YES";
+      case json_lines::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  normalize_single_quotes,
+  [](auto value) {
+    switch (value) {
+      case normalize_single_quotes::YES: return "YES";
+      case normalize_single_quotes::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  normalize_whitespace,
+  [](auto value) {
+    switch (value) {
+      case normalize_whitespace::YES: return "YES";
+      case normalize_whitespace::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  mixed_types_as_string,
+  [](auto value) {
+    switch (value) {
+      case mixed_types_as_string::YES: return "YES";
+      case mixed_types_as_string::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  recovery_mode,
+  [](auto value) {
+    switch (value) {
+      case recovery_mode::FAIL: return "FAIL";
+      case recovery_mode::RECOVER_WITH_NULL: return "RECOVER_WITH_NULL";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })