Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-25.02' into feat/logger
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr committed Dec 3, 2024
2 parents 7848713 + beb4296 commit 2951502
Show file tree
Hide file tree
Showing 69 changed files with 1,270 additions and 1,121 deletions.
2 changes: 1 addition & 1 deletion ci/cpp_linters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ source rapids-configure-sccache
# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.

iwyu_flag=""
if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then
iwyu_flag="-DCUDF_IWYU=ON"
fi
cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja
Expand Down
3 changes: 0 additions & 3 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ dependencies:
- python-confluent-kafka>=2.5.0,<2.6.0a0
- python-xxhash
- python>=3.10,<3.13
- pytorch>=2.1.0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- rapids-dask-dependency==25.2.*,>=0.0.0a0
- rich
Expand All @@ -97,8 +96,6 @@ dependencies:
- sphinxcontrib-websupport
- streamz
- sysroot_linux-64==2.17
- tokenizers==0.15.2
- transformers==4.39.3
- typing_extensions>=4.0.0
- zlib>=1.2.13
name: all_cuda-118_arch-x86_64
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ dependencies:
- python-confluent-kafka>=2.5.0,<2.6.0a0
- python-xxhash
- python>=3.10,<3.13
- pytorch>=2.1.0
- pytorch>=2.4.0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- rapids-dask-dependency==25.2.*,>=0.0.0a0
- rich
Expand Down
14 changes: 11 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ option(
mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF)
option(CUDF_IWYU "Enable IWYU during compilation" OFF)
option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF)

option(
CUDF_KVIKIO_REMOTE_IO
Expand Down Expand Up @@ -205,9 +206,16 @@ function(enable_static_checkers target)
if(_LINT_CLANG_TIDY)
# clang will complain about unused link libraries on the compile line unless we specify
# -Qunused-arguments.
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
)
if(CUDF_CLANG_TIDY_AUTOFIX)
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY
"${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix"
)
else()
set_target_properties(
${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
)
endif()
endif()
if(_LINT_IWYU)
# A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not
Expand Down
163 changes: 24 additions & 139 deletions cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,211 +15,96 @@
*/
#pragma once

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/strings/string_view.cuh>

#include <thrust/pair.h>
#include <cuco/hash_functions.cuh>
#include <cuda/std/array>
#include <cuda/std/cstddef>

namespace cudf::hashing::detail {

// MurmurHash3_x64_128 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
template <typename Key>
struct MurmurHash3_x64_128 {
using result_type = thrust::pair<uint64_t, uint64_t>;
using result_type = cuda::std::array<uint64_t, 2>;

constexpr MurmurHash3_x64_128() = default;
constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {}

__device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const
CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED)
: _impl{seed}
{
// Read a 4-byte value from the data pointer as individual bytes for safe
// unaligned access (very likely for string types).
auto block = reinterpret_cast<uint8_t const*>(data + offset);
return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
}

__device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const
{
uint64_t result = getblock32(data, offset + 4);
result = result << 32;
return result | getblock32(data, offset);
}
__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

__device__ inline uint64_t fmix64(uint64_t k) const
__device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
std::uint64_t size) const
{
k ^= k >> 33;
k *= 0xff51afd7ed558ccdUL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53UL;
k ^= k >> 33;
return k;
return this->_impl.compute_hash(bytes, size);
}

result_type __device__ inline operator()(Key const& key) const { return compute(key); }

private:
template <typename T>
result_type __device__ inline compute(T const& key) const
{
return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
}

result_type __device__ inline compute_remaining_bytes(std::byte const* data,
cudf::size_type len,
cudf::size_type tail_offset,
result_type h) const
{
// Process remaining bytes that do not fill a 8-byte chunk.
uint64_t k1 = 0;
uint64_t k2 = 0;
auto const tail = reinterpret_cast<uint8_t const*>(data) + tail_offset;
switch (len & (BLOCK_SIZE - 1)) {
case 15: k2 ^= static_cast<uint64_t>(tail[14]) << 48;
case 14: k2 ^= static_cast<uint64_t>(tail[13]) << 40;
case 13: k2 ^= static_cast<uint64_t>(tail[12]) << 32;
case 12: k2 ^= static_cast<uint64_t>(tail[11]) << 24;
case 11: k2 ^= static_cast<uint64_t>(tail[10]) << 16;
case 10: k2 ^= static_cast<uint64_t>(tail[9]) << 8;
case 9:
k2 ^= static_cast<uint64_t>(tail[8]) << 0;
k2 *= c2;
k2 = rotate_bits_left(k2, 33);
k2 *= c1;
h.second ^= k2;

case 8: k1 ^= static_cast<uint64_t>(tail[7]) << 56;
case 7: k1 ^= static_cast<uint64_t>(tail[6]) << 48;
case 6: k1 ^= static_cast<uint64_t>(tail[5]) << 40;
case 5: k1 ^= static_cast<uint64_t>(tail[4]) << 32;
case 4: k1 ^= static_cast<uint64_t>(tail[3]) << 24;
case 3: k1 ^= static_cast<uint64_t>(tail[2]) << 16;
case 2: k1 ^= static_cast<uint64_t>(tail[1]) << 8;
case 1:
k1 ^= static_cast<uint64_t>(tail[0]) << 0;
k1 *= c1;
k1 = rotate_bits_left(k1, 31);
k1 *= c2;
h.first ^= k1;
};
return h;
}

result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
__device__ constexpr result_type compute(T const& key) const
{
auto const nblocks = len / BLOCK_SIZE;
uint64_t h1 = m_seed;
uint64_t h2 = m_seed;

// Process all four-byte chunks.
for (cudf::size_type i = 0; i < nblocks; i++) {
uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes
uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes

k1 *= c1;
k1 = rotate_bits_left(k1, 31);
k1 *= c2;

h1 ^= k1;
h1 = rotate_bits_left(h1, 27);
h1 += h2;
h1 = h1 * 5 + 0x52dce729;

k2 *= c2;
k2 = rotate_bits_left(k2, 33);
k2 *= c1;

h2 ^= k2;
h2 = rotate_bits_left(h2, 31);
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;
}

thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2});

// Finalize hash.
h1 ^= len;
h2 ^= len;

h1 += h2;
h2 += h1;

h1 = fmix64(h1);
h2 = fmix64(h2);

h1 += h2;
h2 += h1;

return {h1, h2};
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
}

private:
uint64_t m_seed{};
static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes

static constexpr uint64_t c1 = 0x87c37b91114253d5UL;
static constexpr uint64_t c2 = 0x4cf5ad432745937fUL;
cuco::murmurhash3_x64_128<Key> _impl;
};

template <>
MurmurHash3_x64_128<bool>::result_type __device__ inline MurmurHash3_x64_128<bool>::operator()(
bool const& key) const
{
return compute<uint8_t>(key);
return this->compute<uint8_t>(key);
}

template <>
MurmurHash3_x64_128<float>::result_type __device__ inline MurmurHash3_x64_128<float>::operator()(
float const& key) const
{
return compute(normalize_nans(key));
return this->compute(normalize_nans(key));
}

template <>
MurmurHash3_x64_128<double>::result_type __device__ inline MurmurHash3_x64_128<double>::operator()(
double const& key) const
{
return compute(normalize_nans(key));
return this->compute(normalize_nans(key));
}

template <>
MurmurHash3_x64_128<cudf::string_view>::result_type
__device__ inline MurmurHash3_x64_128<cudf::string_view>::operator()(
cudf::string_view const& key) const
{
auto const data = reinterpret_cast<std::byte const*>(key.data());
auto const len = key.size_bytes();
return compute_bytes(data, len);
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
key.size_bytes());
}

template <>
MurmurHash3_x64_128<numeric::decimal32>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal32>::operator()(
numeric::decimal32 const& key) const
{
return compute(key.value());
return this->compute(key.value());
}

template <>
MurmurHash3_x64_128<numeric::decimal64>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal64>::operator()(
numeric::decimal64 const& key) const
{
return compute(key.value());
return this->compute(key.value());
}

template <>
MurmurHash3_x64_128<numeric::decimal128>::result_type
__device__ inline MurmurHash3_x64_128<numeric::decimal128>::operator()(
numeric::decimal128 const& key) const
{
return compute(key.value());
return this->compute(key.value());
}

} // namespace cudf::hashing::detail
2 changes: 1 addition & 1 deletion cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ template <typename Key>
struct MurmurHash3_x86_32 {
using result_type = hash_value_type;

__host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
: _impl{seed}
{
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/hashing/detail/xxhash_64.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <typename Key>
struct XXHash_64 {
using result_type = std::uint64_t;

__host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/bitmask/is_element_valid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ bool is_element_valid_sync(column_view const& col_view,
CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index.");
if (!col_view.nullable()) { return true; }

bitmask_type word;
bitmask_type word = 0;
// null_mask() returns device ptr to bitmask without offset
size_type index = element_index + col_view.offset();
size_type const index = element_index + col_view.offset();
CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
col_view.null_mask() + word_index(index),
sizeof(bitmask_type),
Expand Down
Loading

0 comments on commit 2951502

Please sign in to comment.