Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into join-stream-ordering
Browse files Browse the repository at this point in the history
  • Loading branch information
lamarrr authored Sep 12, 2024
2 parents 4b8f4bb + 3dbc33a commit c5d5d54
Show file tree
Hide file tree
Showing 31 changed files with 1,626 additions and 323 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

### Conda

cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

```bash
conda install -c rapidsai -c conda-forge -c nvidia \
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
14 changes: 7 additions & 7 deletions cpp/benchmarks/hashing/hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state)
state.add_global_memory_reads<nvbench::int64_t>(num_rows);
// add memory read from bitmasks
if (!no_nulls) {
state.add_global_memory_reads<nvbench::int8_t>(2 *
state.add_global_memory_reads<nvbench::int8_t>(2L *
cudf::bitmask_allocation_size_bytes(num_rows));
}
// memory written depends on used hash
Expand All @@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state)
});
} else if (hash_name == "md5") {
// md5 creates a 32-byte string
state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); });
} else if (hash_name == "sha1") {
// sha1 creates a 40-byte string
state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); });
} else if (hash_name == "sha224") {
// sha224 creates a 56-byte string
state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); });
} else if (hash_name == "sha256") {
// sha256 creates a 64-byte string
state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); });
} else if (hash_name == "sha384") {
// sha384 creates a 96-byte string
state.add_global_memory_writes<nvbench::int8_t>(96 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); });
} else if (hash_name == "sha512") {
// sha512 creates a 128-byte string
state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); });
Expand Down
17 changes: 8 additions & 9 deletions cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,29 +143,28 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
rmm::device_async_resource_ref mr);

/**
* @brief Create a tdigest column of empty clusters.
* @brief Create an empty tdigest column.
*
* The column created contains the specified number of rows of empty clusters.
* An empty tdigest column contains a single row of length 0
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A tdigest column of empty clusters.
* @returns An empty tdigest column.
*/
CUDF_EXPORT
std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Create a scalar of an empty tdigest cluster.
* @brief Create an empty tdigest scalar.
*
* The returned scalar is a struct_scalar that contains a single row of an empty cluster.
* An empty tdigest scalar is a struct_scalar that contains a single row of length 0
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned scalar's device memory.
*
* @returns A scalar of an empty tdigest cluster.
* @returns An empty tdigest scalar.
*/
std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
Expand Down
190 changes: 190 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <map>
Expand Down Expand Up @@ -128,6 +129,19 @@ class json_reader_options {
// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

// Validation checks for Spark
// Whether the JSON validation should be strict or not.
// Note: strict validation enforces the JSON specification https://www.json.org/json-en.html
bool _strict_validation = false;
// Allow leading zeros for numeric values.
// Only enforced when strict validation is enabled.
bool _allow_numeric_leading_zeros = true;
// Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity.
// Only enforced when strict validation is enabled.
bool _allow_nonnumeric_numbers = true;
// Allow unquoted control characters (values >= 0 and < 32 inside a quoted string).
// Only enforced when strict validation is enabled.
bool _allow_unquoted_control_chars = true;
// Additional values to recognize as null values
std::vector<std::string> _na_values;

/**
* @brief Constructor from source info.
*
Expand Down Expand Up @@ -298,6 +312,55 @@ class json_reader_options {
*/
[[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }

/**
* @brief Whether JSON validation should be enforced strictly or not.
*
* @return true if strict validation is enabled
*/
[[nodiscard]] bool is_strict_validation() const { return _strict_validation; }

/**
* @brief Whether leading zeros are allowed in numeric values.
*
* @note This check is enforced only if strict validation is enabled.
*
* @return true if leading zeros are allowed in numeric values
*/
[[nodiscard]] bool is_allowed_numeric_leading_zeros() const
{
return _allow_numeric_leading_zeros;
}

/**
* @brief Whether the unquoted number values NaN, +INF, -INF, +Infinity, Infinity, and
* -Infinity are allowed.
*
* @note This check is enforced only if strict validation is enabled.
*
* @return true if unquoted non-numeric number values are allowed
*/
[[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }

/**
* @brief Whether characters with values greater than or equal to 0 and less than 32 are
* allowed inside a quoted string without some form of escaping.
*
* @note This check is enforced only if strict validation is enabled.
*
* @return true if unquoted control chars are allowed.
*/
[[nodiscard]] bool is_allowed_unquoted_control_chars() const
{
return _allow_unquoted_control_chars;
}

/**
* @brief Returns additional values to recognize as null values.
*
* @return Const reference to the additional values to recognize as null values
*/
[[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }

/**
* @brief Set data types for columns to be read.
*
Expand Down Expand Up @@ -427,6 +490,63 @@ class json_reader_options {
* @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines.
*/
void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }

/**
* @brief Set whether strict validation is enabled or not.
*
* Strict validation must be enabled before calling `allow_numeric_leading_zeros`,
* `allow_nonnumeric_numbers` or `allow_unquoted_control_chars`; those setters throw otherwise.
*
* @param val Boolean value to indicate whether strict validation is enabled.
*/
void set_strict_validation(bool val) { _strict_validation = val; }

/**
* @brief Set whether leading zeros are allowed in numeric values. Strict validation
* must be enabled before calling this setter.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether leading zeros are allowed in numeric values
*/
void allow_numeric_leading_zeros(bool val)
{
// Reject the call up front: the option is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_numeric_leading_zeros = val;
}

/**
* @brief Set whether the unquoted number values NaN, +INF, -INF, +Infinity, Infinity, and
* -Infinity are allowed. Strict validation must be enabled before calling this setter.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether unquoted non-numeric number values are allowed
*/
void allow_nonnumeric_numbers(bool val)
{
// Reject the call up front: the option is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_nonnumeric_numbers = val;
}

/**
* @brief Set whether characters with values greater than or equal to 0 and less than 32
* are allowed inside a quoted string without some form of escaping. Strict validation
* must be enabled before calling this setter.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val true if unquoted control chars are allowed, false otherwise.
*/
void allow_unquoted_control_chars(bool val)
{
// Reject the call up front: the option is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_unquoted_control_chars = val;
}

/**
* @brief Sets additional values to recognize as null values.
*
* @param vals Vector of values to be treated as null; taken by value and moved into the options
*/
void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); }
};

/**
Expand Down Expand Up @@ -638,6 +758,76 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether JSON validation should be strict or not.
*
* Call this with `true` before `numeric_leading_zeros`, `nonnumeric_numbers` or
* `unquoted_control_chars`; those builder methods throw if strict validation is not enabled.
*
* @param val Boolean value to indicate whether JSON validation should be strict or not.
* @return this for chaining
*/
json_reader_options_builder& strict_validation(bool val)
{
options.set_strict_validation(val);
return *this;
}

/**
* @brief Set whether leading zeros are allowed in numeric values. Strict validation must
* be enabled for this to have any effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether leading zeros are allowed in numeric values
* @return this for chaining
*/
json_reader_options_builder& numeric_leading_zeros(bool val)
{
options.allow_numeric_leading_zeros(val);
return *this;
}

/**
* @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
* +INF, -INF, +Infinity, Infinity, and -Infinity.
* Strict validation must be enabled for this to have any effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate if unquoted non-numeric values are valid JSON or not.
* @return this for chaining
*/
json_reader_options_builder& nonnumeric_numbers(bool val)
{
options.allow_nonnumeric_numbers(val);
return *this;
}

/**
* @brief Set whether characters with values >= 0 and < 32 are allowed in a quoted string
* without some form of escaping. Strict validation must be enabled for this to have any
* effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate if unquoted control chars are allowed or not.
* @return this for chaining
*/
json_reader_options_builder& unquoted_control_chars(bool val)
{
options.allow_unquoted_control_chars(val);
return *this;
}

/**
* @brief Sets additional values to recognize as null values.
*
* @param vals Vector of values to be treated as null; moved into the underlying options
* @return this for chaining
*/
json_reader_options_builder& na_values(std::vector<std::string> vals)
{
options.set_na_values(std::move(vals));
return *this;
}

/**
* @brief move json_reader_options member once it's built.
*/
Expand Down
20 changes: 10 additions & 10 deletions cpp/include/cudf_test/tdigest_utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);

// NOTE: an empty tdigest column still has 1 row.
auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
}
Expand Down Expand Up @@ -562,12 +562,12 @@ template <typename MergeFunc>
void tdigest_merge_empty(MergeFunc merge_op)
{
// 3 empty tdigests all in the same group
auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto a = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
std::vector<column_view> cols;
cols.push_back(*a);
cols.push_back(*b);
Expand All @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
auto const delta = 1000;
auto result = merge_op(*values, delta);

auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "io/fst/lookup_tables.cuh"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/detail/json.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/memory_resource.hpp>
Expand Down Expand Up @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down Expand Up @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down
Loading

0 comments on commit c5d5d54

Please sign in to comment.