Skip to content

Commit

Permalink
Fix mean computation for the geometric distribution in the data gener…
Browse files Browse the repository at this point in the history
…ator (#15282)

Since we moved random data generation to the GPU, the geometric distribution has been approximated by a half-normal distribution. However, the mean computation wasn't updated, resulting in a computed mean ~20% higher than that of the actual generated values.
Another issue that exacerbated the problem is the implicit conversion to ints in the random generator. This effectively lowered the mean of generated values by 0.5.

Together, these lead to list columns having the last row with more than 20% of the total column data. Huge single row caused low performance in many benchmarks. For example, Parquet files end up with a few huge pages and load imbalance in decode.

This PR fixes the mean computation to reflect the actual distribution, and rounds the random values when converting to ints. The result is a correct distribution of the number of elements in each randomly generated list.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: #15282
  • Loading branch information
vuule authored Mar 14, 2024
1 parent 769c1bd commit 95ce0bb
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 23 deletions.
13 changes: 7 additions & 6 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,15 @@ double get_distribution_mean(distribution_params<T> const& dist)
case distribution_id::NORMAL:
case distribution_id::UNIFORM: return (dist.lower_bound / 2.) + (dist.upper_bound / 2.);
case distribution_id::GEOMETRIC: {
auto const range_size = dist.lower_bound < dist.upper_bound
? dist.upper_bound - dist.lower_bound
: dist.lower_bound - dist.upper_bound;
auto const p = geometric_dist_p(range_size);
// Geometric distribution is approximated by a half-normal distribution
// Doubling the standard deviation because the dist range only includes half of the (unfolded)
// normal distribution
auto const gauss_std_dev = std_dev_from_range(dist.lower_bound, dist.upper_bound) * 2;
auto const half_gauss_mean = gauss_std_dev * sqrt(2. / M_PI);
if (dist.lower_bound < dist.upper_bound)
return dist.lower_bound + (1. / p);
return dist.lower_bound + half_gauss_mean;
else
return dist.lower_bound - (1. / p);
return dist.lower_bound - half_gauss_mean;
}
default: CUDF_FAIL("Unsupported distribution type.");
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class data_profile {
std::map<cudf::type_id, distribution_params<double>> float_params;
distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
distribution_params<cudf::list_view> list_dist_desc{
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2};
distribution_params<cudf::struct_view> struct_dist_desc{
{cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
Expand Down
44 changes: 28 additions & 16 deletions cpp/benchmarks/common/random_distribution_factory.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,15 +44,25 @@ using integral_to_realType =
T,
std::conditional_t<sizeof(T) * 8 <= 23, float, double>>;

/**
 * @brief Standard deviation such that most normally distributed samples fall within the given
 * range.
 *
 * The range is treated as spanning six standard deviations, i.e. three on each side of the mean;
 * 99.7% of the samples of a normal distribution are within three standard deviations of the mean.
 *
 * @param lower_bound One end of the range
 * @param upper_bound The other end of the range; may be smaller than `lower_bound`
 * @return Absolute size of the range divided by six
 */
template <typename T>
constexpr double std_dev_from_range(T lower_bound, T upper_bound)
{
  // Number of standard deviations covered by the range: +/-3 around the mean
  constexpr double num_std_devs_in_range = 6.0;
  auto const lower = static_cast<double>(lower_bound);
  auto const upper = static_cast<double>(upper_bound);
  // Branch instead of calling std::abs, which is not constexpr before C++23; this keeps the
  // function usable in constant expressions
  auto const range_size = upper < lower ? lower - upper : upper - lower;
  return range_size / num_std_devs_in_range;
}

/**
* @brief Generates a normal distribution between zero and upper_bound.
*/
template <typename T>
auto make_normal_dist(T lower_bound, T upper_bound)
{
using realT = integral_to_realType<T>;
T const mean = lower_bound + (upper_bound - lower_bound) / 2;
T const stddev = (upper_bound - lower_bound) / 6;
using realT = integral_to_realType<T>;
realT const mean = lower_bound / 2. + upper_bound / 2.;
realT const stddev = std_dev_from_range(lower_bound, upper_bound);
return thrust::random::normal_distribution<realT>(mean, stddev);
}

Expand All @@ -68,14 +78,6 @@ auto make_uniform_dist(T range_start, T range_end)
return thrust::uniform_real_distribution<T>(range_start, range_end);
}

/**
 * @brief Computes the parameter `p` of a geometric distribution for a range of the given size.
 *
 * Solves `1 - (1 - p)^range_size = 0.99` for `p`, i.e. `p = 1 - 0.01^(1 / range_size)`.
 *
 * @param range_size Size of the range that the 0.99 fraction refers to
 * @return The computed `p`; `std::numeric_limits<double>::epsilon()` if the result is zero
 */
template <typename T>
double geometric_dist_p(T range_size)
{
  constexpr double percentage_in_range = 0.99;
  // -expm1(x) equals 1 - exp(x), but stays accurate when x is close to zero (very large ranges),
  // where 1 - exp(x) would round to zero
  double const exponent = std::log(1 - percentage_in_range) / range_size;
  double const p        = -std::expm1(exponent);
  // Never return zero, so that callers can safely divide by the result
  return p != 0.0 ? p : std::numeric_limits<double>::epsilon();
}

/**
* @brief Generates a geometric distribution between lower_bound and upper_bound.
* This distribution is an approximation generated using normal distribution.
Expand All @@ -89,10 +91,17 @@ class geometric_distribution : public thrust::random::normal_distribution<integr
T _lower_bound;
T _upper_bound;

/**
 * @brief Builds the zero-centered normal distribution used as the base of this class.
 *
 * The distribution spans the absolute size of the bounds' range on each side of zero.
 */
super_t make_approx_normal_dist(T lower_bound, T upper_bound) const
{
  auto const upper_as_real = static_cast<realType>(upper_bound);
  // Order of the bounds does not matter here; only the distance between them is used
  auto const half_width = std::abs(upper_as_real - lower_bound);
  // Centered at zero; the generated values are offset by lower_bound when sampling
  return make_normal_dist(-half_width, half_width);
}

public:
using result_type = T;
__host__ __device__ explicit geometric_distribution(T lower_bound, T upper_bound)
: super_t(0, std::labs(upper_bound - lower_bound) / 4.0),
explicit geometric_distribution(T lower_bound, T upper_bound)
: super_t(make_approx_normal_dist(lower_bound, upper_bound)),
_lower_bound(lower_bound),
_upper_bound(upper_bound)
{
Expand All @@ -101,8 +110,11 @@ class geometric_distribution : public thrust::random::normal_distribution<integr
/**
 * @brief Draws one value from the approximated geometric distribution.
 *
 * Takes the absolute value of a sample from the underlying normal distribution and offsets it
 * from `_lower_bound` in the direction of `_upper_bound`.
 *
 * @param urng Random number generator passed to the underlying normal distribution
 * @return The generated value, rounded to the nearest whole number
 */
template <typename UniformRandomNumberGenerator>
__host__ __device__ result_type operator()(UniformRandomNumberGenerator& urng)
{
  // Distance from lower_bound; abs() folds the normal distribution onto its positive half
  realType const distance = std::abs(super_t::operator()(urng));
  // Distribution always biases towards lower_bound
  realType const result =
    _lower_bound < _upper_bound ? _lower_bound + distance : _lower_bound - distance;
  // Round explicitly: the implicit conversion to an integral result_type would truncate and
  // lower the mean of the generated values
  return std::round(result);
}
};

Expand Down

0 comments on commit 95ce0bb

Please sign in to comment.