From 95ce0bb0bd9fa0f2c855d7517274f96d835d861a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Mar 2024 14:27:48 -0700 Subject: [PATCH] Fix mean computation for the geometric distribution in the data generator (#15282) Since we moved random data generation to the GPU, the geometric distribution has been approximated by a half-normal distribution. However, the mean computation wasn't updated, causing a ~20% higher mean than the actual generated values. Another issue that exacerbated the problem is the implicit conversion to ints in the random generator. This effectively lowered the mean of generated values by 0.5. Together, these lead to list columns having the last row with more than 20% of the total column data. A huge single row caused low performance in many benchmarks. For example, Parquet files end up with a few huge pages and load imbalance in decode. This PR fixes the mean computation to reflect the actual distribution, and rounds the random values when converting to ints. The result is a correct distribution of the number of elements in each randomly generated list. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Paul Mattione (https://github.com/pmattione-nvidia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/15282 --- cpp/benchmarks/common/generate_input.cu | 13 +++--- cpp/benchmarks/common/generate_input.hpp | 2 +- .../common/random_distribution_factory.cuh | 44 ++++++++++++------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index ccc7bdef527..9857aac4473 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -77,14 +77,15 @@ double get_distribution_mean(distribution_params const& dist) case distribution_id::NORMAL: case distribution_id::UNIFORM: return (dist.lower_bound / 2.) 
+ (dist.upper_bound / 2.); case distribution_id::GEOMETRIC: { - auto const range_size = dist.lower_bound < dist.upper_bound - ? dist.upper_bound - dist.lower_bound - : dist.lower_bound - dist.upper_bound; - auto const p = geometric_dist_p(range_size); + // Geometric distribution is approximated by a half-normal distribution + // Doubling the standard deviation because the dist range only includes half of the (unfolded) + // normal distribution + auto const gauss_std_dev = std_dev_from_range(dist.lower_bound, dist.upper_bound) * 2; + auto const half_gauss_mean = gauss_std_dev * sqrt(2. / M_PI); if (dist.lower_bound < dist.upper_bound) - return dist.lower_bound + (1. / p); + return dist.lower_bound + half_gauss_mean; else - return dist.lower_bound - (1. / p); + return dist.lower_bound - half_gauss_mean; } default: CUDF_FAIL("Unsupported distribution type."); } diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 3bc53e1b5c9..31dc2673d70 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -223,7 +223,7 @@ class data_profile { std::map> float_params; distribution_params string_dist_desc{{distribution_id::NORMAL, 0, 32}}; distribution_params list_dist_desc{ - cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2}; + cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2}; distribution_params struct_dist_desc{ {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2}; std::map> decimal_params; diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index a548e4c9392..c27616132d0 100644 --- a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,15 +44,25 @@ using integral_to_realType = T, std::conditional_t>; +// standard deviation such that most samples fall within the given range +template +constexpr double std_dev_from_range(T lower_bound, T upper_bound) +{ + // 99.7% samples are within 3 standard deviations of the mean + constexpr double k = 6.0; + auto const range_size = std::abs(static_cast(upper_bound) - lower_bound); + return range_size / k; +} + /** * @brief Generates a normal distribution between zero and upper_bound. */ template auto make_normal_dist(T lower_bound, T upper_bound) { - using realT = integral_to_realType; - T const mean = lower_bound + (upper_bound - lower_bound) / 2; - T const stddev = (upper_bound - lower_bound) / 6; + using realT = integral_to_realType; + realT const mean = lower_bound / 2. + upper_bound / 2.; + realT const stddev = std_dev_from_range(lower_bound, upper_bound); return thrust::random::normal_distribution(mean, stddev); } @@ -68,14 +78,6 @@ auto make_uniform_dist(T range_start, T range_end) return thrust::uniform_real_distribution(range_start, range_end); } -template -double geometric_dist_p(T range_size) -{ - constexpr double percentage_in_range = 0.99; - double const p = 1 - exp(log(1 - percentage_in_range) / range_size); - return p ? p : std::numeric_limits::epsilon(); -} - /** * @brief Generates a geometric distribution between lower_bound and upper_bound. * This distribution is an approximation generated using normal distribution. 
@@ -89,10 +91,17 @@ class geometric_distribution : public thrust::random::normal_distribution(upper_bound) - lower_bound); + // Generate normal distribution around zero; output will be shifted by lower_bound + return make_normal_dist(-abs_range_size, abs_range_size); + } + public: using result_type = T; - __host__ __device__ explicit geometric_distribution(T lower_bound, T upper_bound) - : super_t(0, std::labs(upper_bound - lower_bound) / 4.0), + explicit geometric_distribution(T lower_bound, T upper_bound) + : super_t(make_approx_normal_dist(lower_bound, upper_bound)), _lower_bound(lower_bound), _upper_bound(upper_bound) { @@ -101,8 +110,11 @@ class geometric_distribution : public thrust::random::normal_distribution __host__ __device__ result_type operator()(UniformRandomNumberGenerator& urng) { - return _lower_bound < _upper_bound ? std::abs(super_t::operator()(urng)) + _lower_bound - : _lower_bound - std::abs(super_t::operator()(urng)); + // Distribution always biases towards lower_bound + realType const result = _lower_bound < _upper_bound + ? std::abs(super_t::operator()(urng)) + _lower_bound + : _lower_bound - std::abs(super_t::operator()(urng)); + return std::round(result); } };