Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a libcudf/thrust-based TPC-H derived datagen #16294

Merged
merged 141 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
141 commits
Select commit Hold shift + click to select a range
cbf25da
Generate part, partsupp, customer, supplier, nation, region tables
JayjeetAtGithub Jul 14, 2024
455df49
Add consts
JayjeetAtGithub Jul 24, 2024
88ecc8a
Add consts
JayjeetAtGithub Jul 24, 2024
d779512
Fix consts placements
JayjeetAtGithub Jul 24, 2024
bdbe0e9
Add note about incomplete table
JayjeetAtGithub Jul 24, 2024
d2dd908
Can generate the timestamps cols
JayjeetAtGithub Jul 25, 2024
b2d6b56
Generate a mask based on predicate
JayjeetAtGithub Jul 25, 2024
963a160
Generate l_linestatus col
JayjeetAtGithub Jul 25, 2024
fd761cc
Generate l_returnflag col
JayjeetAtGithub Jul 25, 2024
bb1a86e
Minor changes
JayjeetAtGithub Jul 25, 2024
2485989
Generate o_orderkey
JayjeetAtGithub Jul 26, 2024
b218f5c
Finish the orders table except the ones dependent on lineitem
JayjeetAtGithub Jul 26, 2024
2c4891f
Rename fns
JayjeetAtGithub Jul 26, 2024
73bcf64
Add o_totalprice col
JayjeetAtGithub Jul 26, 2024
fb6c20a
Remove comment
JayjeetAtGithub Jul 26, 2024
a18e214
Fixes
JayjeetAtGithub Jul 26, 2024
4b974d4
Generate o_orderstatus
JayjeetAtGithub Jul 27, 2024
8bf1381
stream and mr params added to supplier table
JayjeetAtGithub Jul 27, 2024
a97a249
Add stream/mr to customer, nation, region
JayjeetAtGithub Jul 27, 2024
e4d9917
Add a lot of stream and mr params and refactoring
JayjeetAtGithub Jul 27, 2024
345a827
Extracted table helper functions into table_helpers.hpp
JayjeetAtGithub Jul 27, 2024
169e121
More refactorings
JayjeetAtGithub Jul 27, 2024
169ef5d
Renames, streams, mrs, refactoring
JayjeetAtGithub Jul 27, 2024
8ae5e45
update notes
JayjeetAtGithub Jul 27, 2024
de72ea2
Move calc_p_retailprice out to table helpers
JayjeetAtGithub Jul 27, 2024
67f63dd
Generating 6/8 correctly
JayjeetAtGithub Jul 27, 2024
e7a0320
Can gen lineitem table
JayjeetAtGithub Jul 27, 2024
3048a17
Can gen all tables
JayjeetAtGithub Jul 27, 2024
935a826
Remove extra col from lineitem
JayjeetAtGithub Jul 27, 2024
d43b1d1
Functionally complete data generator
JayjeetAtGithub Jul 27, 2024
f341de8
Make address fields as compliant as possible
JayjeetAtGithub Jul 27, 2024
a27edcd
Make address fields as compliant as possible
JayjeetAtGithub Jul 27, 2024
3be7ec5
Refactor
JayjeetAtGithub Jul 28, 2024
96762a6
Functionally complete data generator
JayjeetAtGithub Jul 28, 2024
d771023
More log statements
JayjeetAtGithub Jul 28, 2024
20477fb
Add optmizayions
JayjeetAtGithub Jul 30, 2024
85b417e
Add optmizations
JayjeetAtGithub Jul 30, 2024
be77ad4
Optimizations and refactoring
JayjeetAtGithub Jul 31, 2024
393ba2f
Use cudf gather instead
JayjeetAtGithub Jul 31, 2024
7aac965
Use int32 wherever possible
JayjeetAtGithub Jul 31, 2024
e674a11
Use smaller ints as much as possible
JayjeetAtGithub Jul 31, 2024
1967843
Minimize supplier, customer, nation, region
JayjeetAtGithub Jul 31, 2024
f59e24d
Use smaller integers
JayjeetAtGithub Jul 31, 2024
6ab11da
Use smaller integers
JayjeetAtGithub Jul 31, 2024
ccd484b
Fix bugs in lineitem
JayjeetAtGithub Jul 31, 2024
0a9a98c
Generate supplier
JayjeetAtGithub Jul 31, 2024
e49236f
Use ast
JayjeetAtGithub Jul 31, 2024
18d8b09
always move the tables
JayjeetAtGithub Jul 31, 2024
93c0255
Use a smaller pool size
JayjeetAtGithub Jul 31, 2024
8c84b2f
check peak memory
JayjeetAtGithub Jul 31, 2024
7e03d4d
check peak memory
JayjeetAtGithub Jul 31, 2024
b76cd86
remove inner join
JayjeetAtGithub Jul 31, 2024
2c1aabc
Track GPU memory
JayjeetAtGithub Jul 31, 2024
ee9a510
Track memory available and usage
JayjeetAtGithub Jul 31, 2024
8382b1c
use strings apis
karthikeyann Aug 5, 2024
fd1f51c
Remove old comments
JayjeetAtGithub Aug 5, 2024
91fe4f3
Remove usage of select
JayjeetAtGithub Aug 5, 2024
208e9df
Remove usage of select
JayjeetAtGithub Aug 5, 2024
f57701f
Add more commentS
JayjeetAtGithub Aug 5, 2024
5c4895c
Revert unintended change
JayjeetAtGithub Aug 5, 2024
d2c4d79
Use chunked parquet writer
JayjeetAtGithub Aug 5, 2024
e4e0a57
Change to single line comment
JayjeetAtGithub Aug 8, 2024
8f738d7
Fix log from write parquet
JayjeetAtGithub Aug 8, 2024
32f5851
Create a rand_utilities.cuh
JayjeetAtGithub Aug 8, 2024
e637129
Add a header file for dbgen
JayjeetAtGithub Aug 8, 2024
bbadf87
Refactor cudf_datagen
JayjeetAtGithub Aug 8, 2024
a33e532
Fix docstrings
JayjeetAtGithub Aug 8, 2024
0161572
Cleanup cmake
JayjeetAtGithub Aug 8, 2024
151d6f9
Install datagen globally and use from inside examples
JayjeetAtGithub Aug 8, 2024
e2ae44c
Rename files
JayjeetAtGithub Aug 8, 2024
1519812
Fix namespace comment
JayjeetAtGithub Aug 8, 2024
fc97e15
Remove unnecessary headers
JayjeetAtGithub Aug 9, 2024
1ad7794
Remove vocab.hpp
JayjeetAtGithub Aug 9, 2024
15c6a9d
Add stream param
JayjeetAtGithub Aug 9, 2024
a0122da
Write to pq files
JayjeetAtGithub Aug 9, 2024
cf74731
Fix clang errors
JayjeetAtGithub Aug 10, 2024
23886bf
use cudf::type_to_id
JayjeetAtGithub Aug 10, 2024
5435522
Move memory logger and rmm utils to common utils
JayjeetAtGithub Aug 10, 2024
c63dc11
Add statistics resource adaptor
JayjeetAtGithub Aug 10, 2024
189a962
Use auto
JayjeetAtGithub Aug 10, 2024
d1b8fb1
Round up double cols to 2 places
JayjeetAtGithub Aug 11, 2024
735ef6b
Add missing export of partsupp
JayjeetAtGithub Aug 11, 2024
530a468
generate ps in dbgen example
JayjeetAtGithub Aug 11, 2024
0eae25f
Fix o_custkey
JayjeetAtGithub Aug 12, 2024
2b3bd81
Refactor rand_utilities into .cu and .hpp files
JayjeetAtGithub Aug 13, 2024
bf4612d
Make tpch_datagen a .cpp file
JayjeetAtGithub Aug 13, 2024
adf5128
Refactor table_helpers into .cpp and .hpp files
JayjeetAtGithub Aug 13, 2024
2b84da9
Fix duplicate definitions
JayjeetAtGithub Aug 13, 2024
1d27719
Include header files
JayjeetAtGithub Aug 13, 2024
c085c40
Move stuff out of headers
JayjeetAtGithub Aug 13, 2024
bee84e6
Fixes
JayjeetAtGithub Aug 13, 2024
c4e3a99
Simplify lineitem generation
JayjeetAtGithub Aug 14, 2024
f1144b1
Fix variable naming
JayjeetAtGithub Aug 14, 2024
5eba67d
Fix l_extendedprice type
JayjeetAtGithub Aug 14, 2024
5acbcb4
Round up o_totalprice to 2 dec places
JayjeetAtGithub Aug 14, 2024
18b05cc
Remove cudf_benchmark
JayjeetAtGithub Aug 15, 2024
5235a8a
Remove cudf_benchmark include
JayjeetAtGithub Aug 15, 2024
1700bb9
Add default value of stream/mr
JayjeetAtGithub Aug 15, 2024
ea8bdbd
Remove dbgen binary
JayjeetAtGithub Aug 15, 2024
d6f9a30
Remove dbgen binary
JayjeetAtGithub Aug 15, 2024
d709019
dont install
JayjeetAtGithub Aug 15, 2024
085b591
remove template param from gen_primary_key_col
JayjeetAtGithub Aug 18, 2024
1ba70cb
Merge branch 'branch-24.10' into cudf_datagen
JayjeetAtGithub Aug 18, 2024
afccc6e
Update cpp/benchmarks/common/cudf_tpch_datagen/tpch_datagen.cpp
JayjeetAtGithub Aug 21, 2024
d963dcd
Fix naming of functions
JayjeetAtGithub Aug 21, 2024
5b6e9db
Fix naming tpch_datagen.cpp
JayjeetAtGithub Aug 21, 2024
0dc84b3
More name fixes
JayjeetAtGithub Aug 21, 2024
1847ba5
Use constexpr
JayjeetAtGithub Aug 21, 2024
7d39f48
Rename rand utilities to random column generator
JayjeetAtGithub Aug 21, 2024
64e57fe
Add space
JayjeetAtGithub Aug 22, 2024
4a3e91e
Support scale factor < 1
JayjeetAtGithub Aug 22, 2024
3fc4767
Remove the schema vectors
JayjeetAtGithub Aug 22, 2024
986ccde
Use cudf host span
JayjeetAtGithub Aug 23, 2024
ca391c1
Fix datagen example
JayjeetAtGithub Aug 23, 2024
9c5ac50
Fix includes
JayjeetAtGithub Aug 23, 2024
6b40484
Fix cmake
JayjeetAtGithub Aug 23, 2024
674d8be
Merge branch 'branch-24.10' into cudf_datagen
JayjeetAtGithub Aug 25, 2024
5b23f89
Remove const from inexpensive data types
JayjeetAtGithub Aug 26, 2024
b7bdbba
Support memory resource
JayjeetAtGithub Aug 27, 2024
f2a9106
Remove the datagen.cpp
JayjeetAtGithub Aug 27, 2024
f7ac4bd
Address reviews 1
JayjeetAtGithub Aug 27, 2024
ab2fec2
Use full names
JayjeetAtGithub Aug 27, 2024
cba1369
Add namespaces
JayjeetAtGithub Aug 27, 2024
90b8e6b
Avoid copies using release and move
JayjeetAtGithub Aug 27, 2024
ec927d9
Avoid copies using release and move
JayjeetAtGithub Aug 27, 2024
10db2bb
Use structured binding
JayjeetAtGithub Aug 27, 2024
e2daa49
Remove indices from gather maps
JayjeetAtGithub Aug 28, 2024
3b5e589
Use release and move pattern
JayjeetAtGithub Aug 28, 2024
8c8373d
Add examples in docstrings
JayjeetAtGithub Aug 28, 2024
1c408a5
Rename to tpch_data_generator
JayjeetAtGithub Aug 28, 2024
7bf394d
Use nested namespaces
JayjeetAtGithub Aug 28, 2024
736ddb9
Update cpp/benchmarks/common/tpch_data_generator/random_column_genera…
JayjeetAtGithub Aug 28, 2024
0252522
Rename files
JayjeetAtGithub Aug 28, 2024
6ba23f3
Use release and move
JayjeetAtGithub Aug 28, 2024
1cee77e
Undo the structured binding
JayjeetAtGithub Aug 29, 2024
99cb18c
Add back indices
JayjeetAtGithub Aug 29, 2024
b57dfda
Use macro and remove __host__ where not needed
JayjeetAtGithub Aug 29, 2024
38b6372
Remove datagen.cpp
JayjeetAtGithub Aug 29, 2024
3e83f87
Merge branch 'branch-24.10' into cudf_datagen
JayjeetAtGithub Aug 29, 2024
6e0df3c
Remove couts
JayjeetAtGithub Aug 29, 2024
91c5bd3
remove usless code
JayjeetAtGithub Aug 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,30 @@ target_include_directories(
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)

# Static library holding the TPC-H derived data generator sources (two host .cpp files and one
# CUDA .cu file) that live under common/tpch_data_generator.
add_library(
tpch_data_generator STATIC
common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
common/tpch_data_generator/random_column_generator.cu
)
# Require C++17 for both host and CUDA compilation; PUBLIC so targets linking this library
# inherit the requirement.
target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)

# Apply CUDF_CXX_FLAGS to CXX sources and CUDF_CUDA_FLAGS to CUDA sources.
target_compile_options(
tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
)

# cudf, cudftestutil and nvtx3 are propagated to consumers; conda_env is linked privately and only
# when a target with that name exists.
target_link_libraries(
tpch_data_generator
PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

# Build-tree include paths: this directory, the cudf source root, and its src/ directory.
target_include_directories(
tpch_data_generator
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)

# ##################################################################################################
# * compiler function -----------------------------------------------------------------------------

Expand Down
246 changes: 246 additions & 0 deletions cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "random_column_generator.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/utilities/error.hpp>

#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>

#include <limits>
#include <string>

namespace cudf::datagen {

namespace {

// Device functor that fills the bytes of one string inside a shared character buffer.
//
// Each call receives the (begin, end) byte positions of a single string and writes
// pseudo-random characters into `chars[begin, end)`. Characters are drawn from the
// distribution configured in the constructor, i.e. the range [44, 122].
struct random_string_generator {
  char* chars;                           // destination buffer shared by all strings
  thrust::default_random_engine engine;  // default-seeded; advanced per call (see below)
  thrust::uniform_int_distribution<unsigned char> char_dist;

  CUDF_HOST_DEVICE random_string_generator(char* c) : chars(c), char_dist(44, 122) {}

  __device__ void operator()(thrust::tuple<int64_t, int64_t> str_begin_end)
  {
    auto const first = thrust::get<0>(str_begin_end);
    auto const last  = thrust::get<1>(str_begin_end);

    // Skip `first` draws so the characters produced depend on where the string starts.
    engine.discard(first);

    for (auto pos = first; pos < last; ++pos) {
      auto ch = char_dist(engine);

      // The final byte of a string has no room for a two-byte character, so fall back to a space.
      if (pos == last - 1 && ch >= '\x7F') { ch = ' '; }

      // Values at or above 0x7F (the top edge of ASCII) are written as two bytes: a 0xC4 lead
      // byte followed by `ch + 1`. With the [44, 122] distribution above this path is not taken.
      bool const is_two_byte = ch >= '\x7F';
      if (is_two_byte) { chars[pos++] = '\xC4'; }
      chars[pos] = static_cast<char>(ch + is_two_byte);
    }
  }
};

// Device functor that maps a row index to a pseudo-random number bounded by `lower` and `upper`.
//
// A default-seeded engine is advanced by `idx` draws before sampling, so the value produced for
// a given index is the same on every call. Integral `T` uses a uniform integer distribution;
// any other `T` uses a uniform real distribution.
template <typename T>
struct random_number_generator {
  T lower;
  T upper;

  CUDF_HOST_DEVICE random_number_generator(T lower, T upper) : lower(lower), upper(upper) {}

  __device__ T operator()(const int64_t idx) const
  {
    thrust::default_random_engine rng;
    rng.discard(idx);

    if constexpr (cudf::is_integral<T>()) {
      thrust::uniform_int_distribution<T> pick(lower, upper);
      return pick(rng);
    } else {
      thrust::uniform_real_distribution<T> pick(lower, upper);
      return pick(rng);
    }
  }
};

} // namespace

// Generate a strings column of `num_rows` rows filled with pseudo-random characters.
//
// The length of each string is produced by `random_number_generator<cudf::size_type>(lower,
// upper)` evaluated at the row index. The lengths are turned into an offsets column, and the
// character data is then written in parallel, one string per thread.
//
// @param lower    Lower bound passed to the string-length generator
// @param upper    Upper bound passed to the string-length generator
// @param num_rows Number of rows in the output column
// @param stream   CUDA stream used for device memory operations and kernel launches
// @param mr       Device memory resource used to allocate the returned column's device memory
// @return A strings column with no nulls
std::unique_ptr<cudf::column> generate_random_string_column(cudf::size_type lower,
                                                            cudf::size_type upper,
                                                            cudf::size_type num_rows,
                                                            rmm::cuda_stream_view stream,
                                                            rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto offsets_begin = cudf::detail::make_counting_transform_iterator(
    0, random_number_generator<cudf::size_type>(lower, upper));
  auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column(
    offsets_begin, offsets_begin + num_rows, stream, mr);

  // The character buffer is released into the returned column below, so it must be allocated
  // from the caller-provided memory resource rather than the current default resource.
  rmm::device_uvector<char> chars(computed_bytes, stream, mr);

  auto const offset_itr =
    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

  // We generate the strings in parallel into the `chars` vector using the
  // offsets vector generated above. Zipping the offsets with themselves shifted
  // by one yields the (begin, end) byte range of each string.
  thrust::for_each_n(rmm::exec_policy(stream),
                     thrust::make_zip_iterator(offset_itr, offset_itr + 1),
                     num_rows,
                     random_string_generator(chars.data()));

  return cudf::make_strings_column(
    num_rows, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
}

// Generate a numeric column of `num_rows` pseudo-random values of type `T`.
//
// Row `i` of the output holds `random_number_generator<T>(lower, upper)(i)`.
//
// @tparam T       Element type of the output column
// @param lower    Lower bound passed to the number generator
// @param upper    Upper bound passed to the number generator
// @param num_rows Number of rows in the output column
// @param stream   CUDA stream used for device memory operations and kernel launches
// @param mr       Device memory resource used to allocate the returned column's device memory
// @return A numeric column without a null mask
template <typename T>
std::unique_ptr<cudf::column> generate_random_numeric_column(T lower,
                                                             T upper,
                                                             cudf::size_type num_rows,
                                                             rmm::cuda_stream_view stream,
                                                             rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto const output_type = cudf::data_type{cudf::type_to_id<T>()};
  auto result            = cudf::make_numeric_column(
    output_type, num_rows, cudf::mask_state::UNALLOCATED, stream, mr);

  // Feed the row indices [0, num_rows) through the generator, writing straight into the column.
  auto const row_indices = thrust::make_counting_iterator(cudf::size_type{0});
  thrust::transform(rmm::exec_policy(stream),
                    row_indices,
                    row_indices + num_rows,
                    result->mutable_view().begin<T>(),
                    random_number_generator<T>(lower, upper));
  return result;
}

// Explicit instantiations of generate_random_numeric_column. The template is defined in this
// translation unit, so these are the element types (int8_t, int16_t, cudf::size_type, double)
// for which code is emitted here.
template std::unique_ptr<cudf::column> generate_random_numeric_column<int8_t>(
int8_t lower,
int8_t upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<int16_t>(
int16_t lower,
int16_t upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<cudf::size_type>(
cudf::size_type lower,
cudf::size_type upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<double>(
double lower,
double upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

// Generate a column of `num_rows` sequential values beginning at `start`.
//
// This forwards to `cudf::sequence` without an explicit step argument.
//
// @param start    Scalar holding the first value of the sequence
// @param num_rows Number of rows in the output column
// @param stream   CUDA stream used for device memory operations and kernel launches
// @param mr       Device memory resource used to allocate the returned column's device memory
// @return The generated sequence column
std::unique_ptr<cudf::column> generate_primary_key_column(cudf::scalar const& start,
                                                          cudf::size_type num_rows,
                                                          rmm::cuda_stream_view stream,
                                                          rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto key_column = cudf::sequence(num_rows, start, stream, mr);
  return key_column;
}

// Generate a strings column in which every one of the `num_rows` rows equals `value`.
//
// @param value    String to repeat in every row
// @param num_rows Number of rows in the output column
// @param stream   CUDA stream used for device memory operations and kernel launches
// @param mr       Device memory resource used to allocate the returned column's device memory
// @return A column built from a string scalar holding `value`
std::unique_ptr<cudf::column> generate_repeat_string_column(std::string const& value,
                                                            cudf::size_type num_rows,
                                                            rmm::cuda_stream_view stream,
                                                            rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  // Build the scalar on `stream` (not the default stream) so that its device copy is ordered
  // before the column fill that reads it on the same stream.
  auto const scalar = cudf::string_scalar(value, true, stream);
  return cudf::make_column_from_scalar(scalar, num_rows, stream, mr);
}

// Generate a strings column of `num_rows` rows, each row being a pseudo-randomly chosen member
// of `set`.
//
// The set is materialized as a small table (index column + strings column) and a column of
// random 16-bit indices is used as the gather map into it.
//
// @param set      Host strings to choose from; must be non-empty unless `num_rows` is 0, and
//                 must hold no more than `std::numeric_limits<int16_t>::max()` entries
// @param num_rows Number of rows in the output column
// @param stream   CUDA stream used for device memory operations and kernel launches
// @param mr       Device memory resource used to allocate the returned column's device memory
// @return The gathered strings column
// @throws cudf::logic_error if the constraints on `set` are violated
std::unique_ptr<cudf::column> generate_random_string_column_from_set(
  cudf::host_span<const char* const> set,
  cudf::size_type num_rows,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();

  // `set.size() - 1` below would wrap around for an empty set and yield an invalid index range.
  CUDF_EXPECTS(num_rows == 0 || set.size() > 0,
               "Cannot choose random strings from an empty set");
  // Indices are stored as int16_t, so the set has to fit within 16-bit integers.
  CUDF_EXPECTS(set.size() <= static_cast<std::size_t>(std::numeric_limits<int16_t>::max()),
               "String set is too large to be indexed with 16-bit integers");
  auto const set_size = static_cast<cudf::size_type>(set.size());

  // Build a gather map of random strings to choose from
  // The start scalar is created on `stream` so it is ordered with the sequence that reads it.
  auto const indices = generate_primary_key_column(
    cudf::numeric_scalar<int16_t>(0, true, stream), set_size, stream, mr);
  // NOTE(review): the column wrapper does not take a stream argument here; presumably it builds
  // the column on the test utilities' default stream — confirm this is acceptable.
  auto const keys       = cudf::test::strings_column_wrapper(set.begin(), set.end()).release();
  auto const gather_map = cudf::table_view({indices->view(), keys->view()});

  // Build a column of random keys to gather from the set
  auto const gather_keys = generate_random_numeric_column<int16_t>(
    0, static_cast<int16_t>(set_size - 1), num_rows, stream, mr);

  // Perform the gather operation. Bounds checking is skipped because the keys are generated
  // within [0, set_size - 1].
  auto const gathered_table = cudf::gather(
    gather_map, gather_keys->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
  auto gathered_table_columns = gathered_table->release();
  // Column 0 is the gathered index column; column 1 holds the strings.
  return std::move(gathered_table_columns[1]);
}

// Generate a column of `num_rows` values that cycles through a sequence of length `seq_length`.
//
// The column is computed as `row_index MOD seq_length`, giving 0, 1, ..., seq_length - 1, 0, ...
// When `zero_indexed` is false, 1 is added to every element so the cycle starts at 1 instead.
//
// @tparam T           Element type of the output column
// @param seq_length   Length of the repeating sequence; must be non-zero
// @param zero_indexed Whether the sequence starts at 0 (true) or at 1 (false)
// @param num_rows     Number of rows in the output column
// @param stream       CUDA stream used for device memory operations and kernel launches
// @param mr           Device memory resource used to allocate the returned column's device memory
// @return The repeating sequence column
// @throws cudf::logic_error if `seq_length` is zero
template <typename T>
std::unique_ptr<cudf::column> generate_repeat_sequence_column(T seq_length,
                                                              bool zero_indexed,
                                                              cudf::size_type num_rows,
                                                              rmm::cuda_stream_view stream,
                                                              rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  // A zero length would turn the MOD below into a division by zero.
  CUDF_EXPECTS(seq_length != 0, "seq_length must be non-zero");

  auto const output_type = cudf::data_type{cudf::type_to_id<T>()};

  // Scalars are created on `stream` so that their device copies are ordered before the
  // operations on the same stream that read them.
  auto pkey = generate_primary_key_column(
    cudf::numeric_scalar<cudf::size_type>(0, true, stream), num_rows, stream, mr);
  auto repeat_seq_zero_indexed =
    cudf::binary_operation(pkey->view(),
                           cudf::numeric_scalar<T>(seq_length, true, stream),
                           cudf::binary_operator::MOD,
                           output_type,
                           stream,
                           mr);
  if (zero_indexed) { return repeat_seq_zero_indexed; }

  // Shift the cycle from [0, seq_length - 1] to [1, seq_length].
  return cudf::binary_operation(repeat_seq_zero_indexed->view(),
                                cudf::numeric_scalar<T>(1, true, stream),
                                cudf::binary_operator::ADD,
                                output_type,
                                stream,
                                mr);
}

// Explicit instantiations of generate_repeat_sequence_column. The template is defined in this
// translation unit, so these are the element types (int8_t, cudf::size_type) for which code is
// emitted here.
template std::unique_ptr<cudf::column> generate_repeat_sequence_column<int8_t>(
int8_t seq_length,
bool zero_indexed,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_repeat_sequence_column<cudf::size_type>(
cudf::size_type seq_length,
bool zero_indexed,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace cudf::datagen
Loading
Loading