diff --git a/README.md b/README.md index 34594ca..2c8976f 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,8 @@ After starting the container, you can build and execute BGHT code without any ad ### limitations * Currently hash tables based on cuckoo hashing do not support concurrent insertion and queries. IHT and P2BHT support concurrent insertions and queries. -* Keys must be unique +For hash tables that use a probing scheme other than IHT: +* Keys must be unique. * Construction of the data structures offered *may* fail. In these scenarios, reconstructing the table using a larger capacity or a lower load factor should be considered. Our paper offers recommended hash table load factors (for uniformly distributed unsigned keys) to achieve at least a 99% success rate ([See Fig. 2](https://arxiv.org/abs/2108.07232)). For example, BCHT will offer a 100% success rate for up to 0.991 load factor. Please create an issue if you encounter any problems with different key distributions. ## Reproducing the arXiv paper results diff --git a/examples/1cht_example.cu b/examples/1cht_example.cu index d307047..b9b5611 100644 --- a/examples/1cht_example.cu +++ b/examples/1cht_example.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 The Regents of the University of California, Davis + * Copyright 2021-2024 The Regents of the University of California, Davis * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,7 +109,7 @@ int main(int argc, char** argv) { if (expected_pair.second != found_result) { std::cout << "Error: expected: " << expected_pair.second; std::cout << ", found: " << found_result << '\n'; - return; + return 0; } } std::cout << "Success\n"; diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index dcb3241..b2cc5f5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(example_targets custom_types_example iht_example hash_join + histogram ) foreach(target ${example_targets}) diff --git a/examples/bcht_example.cu b/examples/bcht_example.cu index 0d9da57..3da31b0 100644 --- a/examples/bcht_example.cu +++ b/examples/bcht_example.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 The Regents of the University of California, Davis + * Copyright 2021-2024 The Regents of the University of California, Davis * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ int main(int argc, char** argv) { if (expected_pair.second != found_result) { std::cout << "Error: expected: " << expected_pair.second; std::cout << ", found: " << found_result << '\n'; - return; + return 0; } } std::cout << "Success\n"; diff --git a/examples/histogram.cu b/examples/histogram.cu new file mode 100644 index 0000000..9665422 --- /dev/null +++ b/examples/histogram.cu @@ -0,0 +1,149 @@ +/* + * Copyright 2024 The Regents of the University of California, Davis + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +__global__ void print(HashMap result_table) { + using pair_type = typename HashMap::value_type; + using key_type = typename HashMap::key_type; + using value_type = typename HashMap::mapped_type; + + const auto capacity = result_table.max_size(); + const auto sentinel_key = result_table.get_sentinel_key(); + const auto sentinel_pair = result_table.get_sentinel_pair(); + + auto begin = result_table.begin(); + + auto thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + const auto pair = thread_id < capacity + ? (begin + thread_id)->load(cuda::memory_order_relaxed) + : sentinel_pair; + if (pair.first != sentinel_key) { + printf("result_map[%u] = %u\n", pair.first, pair.second); + }; +} + +template +__global__ void histogram(HashMap map, key_type* keys, std::size_t count) { + using pair_type = typename HashMap::value_type; + using value_type = typename HashMap::mapped_type; + + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + using queue_type = bght::tile_wide_queue; + + const auto sentinel_key = map.get_sentinel_key(); + + auto thread_id = threadIdx.x + blockIdx.x * blockDim.x; + + const auto key = thread_id < count ? keys[thread_id] : sentinel_key; + + queue_type work_queue(key, sentinel_key, tile); + + while (!work_queue.empty()) { + auto cur_key = work_queue.front(); + work_queue.pop(); + // Try inserting the key with count of 1 + auto result = map.insert({cur_key, 1}, tile); + const bool success = result.second == true; + // If insertion failed, one thread out of the tile need to try to atomically increment + // the count. + if (!success) { + // make sure one thread in the tile do the increment + if (tile.thread_rank() == 0) { + bool exchange_success{false}; + auto expected = result.first->load(cuda::memory_order_relaxed); + while (!exchange_success) { + pair_type desired{expected.first, expected.second + 1}; + cuda::memory_order success = cuda::memory_order_relaxed; + cuda::memory_order failure = cuda::memory_order_relaxed; + // `expected` here will be modified with the latest value if the + // `compare_exchange_strong` fails. On failure, the next iteration will add 1 to + // the last found value in memory. + // Similar logic can be used to implement min or max. + exchange_success = + result.first->compare_exchange_strong(expected, desired, success, failure); + } + } + } + } +} + +int main(int, char**) { + using key_type = std::uint32_t; + using count_type = std::uint32_t; + using pair_type = bght::pair; + + std::vector h_keys{512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, + 2, 2, 4, 4, 4, 5, 5, 5, 5, 100, 512}; + + std::cout << "Building a histogram for keys: "; + for (const auto k : h_keys) { + std::cout << k << ", "; + } + std::cout << std::endl; + + const auto num_keys = h_keys.size(); + + auto invalid_key = std::numeric_limits::max(); + auto invalid_value = std::numeric_limits::max(); + + const float load_factor = 0.7; + + std::size_t capacity = double(num_keys) / load_factor; + + using hash_map = bght::iht; + + hash_map map(capacity, invalid_key, invalid_value); + + thrust::device_vector d_keys(h_keys); + + const uint32_t block_size = 128; + uint32_t num_blocks = (num_keys + block_size - 1) / block_size; + histogram<<>>(map, d_keys.data().get(), num_keys); + + cuda_try(cudaDeviceSynchronize()); + + std::cout << "Found results:" << std::endl; + + num_blocks = (capacity + block_size - 1) / block_size; + print<<>>(map); + + cuda_try(cudaDeviceSynchronize()); + + std::cout << "Exepcted reults: " << std::endl; + std::unordered_map histogram; + for (int key : h_keys) { + histogram[key]++; + } + + for (const auto& pair : histogram) { + printf("ground_truth[%u] = %u\n", pair.first, pair.second); + } +} \ No newline at end of file diff --git a/include/bght/detail/bucket.cuh b/include/bght/detail/bucket.cuh index 8cd6c81..30633bf 100644 --- a/include/bght/detail/bucket.cuh +++ b/include/bght/detail/bucket.cuh @@ -1,5 +1,5 @@ /* - * Copyright 2021 The Regents of the University of California, Davis + * Copyright 2021-2024 The Regents of the University of California, Davis * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,6 +79,19 @@ struct bucket { return cas_success; } + DEVICE_QUALIFIER + pair_type strong_cas_at_location_ret_old( + const pair_type& pair, + const int location, + const pair_type& sentinel, + cuda::memory_order success = cuda::memory_order_seq_cst, + cuda::memory_order failure = cuda::memory_order_seq_cst) { + pair_type expected = sentinel; + pair_type desired = pair; + ptr_[location].compare_exchange_strong(expected, desired, success, failure); + return expected; + } + DEVICE_QUALIFIER pair_type exch_at_location(const pair_type& pair, const int location, @@ -87,6 +100,9 @@ struct bucket { return old; } + DEVICE_QUALIFIER + atomic_pair_type* begin() { return ptr_; } + private: pair_type lane_pair_; atomic_pair_type* ptr_; diff --git a/include/bght/detail/iht_impl.cuh b/include/bght/detail/iht_impl.cuh index 855bbcd..94ce3c1 100644 --- a/include/bght/detail/iht_impl.cuh +++ b/include/bght/detail/iht_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright 2021 The Regents of the University of California, Davis + * Copyright 2021-2024 The Regents of the University of California, Davis * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -136,7 +136,7 @@ bool iht::insert( const auto num_keys = std::distance(first, last); const uint32_t block_size = 128; const uint32_t num_blocks = (num_keys + block_size - 1) / block_size; - detail::kernels::tiled_insert_kernel<<>>( + detail::kernels::iht_tiled_insert_kernel<<>>( first, last, *this); bool success; cuda_try(cudaMemcpyAsync( @@ -175,7 +175,10 @@ template template -__device__ bool bght::iht::insert( +__device__ cuda::std::pair< + typename bght::iht::iterator, + bool> +bght::iht::insert( value_type const& pair, tile_type const& tile) { auto primary_bucket = hfp_(pair.first) % num_buckets_; @@ -184,7 +187,7 @@ __device__ bool bght::iht; @@ -194,6 +197,11 @@ __device__ bool bght::iht 0) { bucket.load(cuda::memory_order_relaxed); INCREMENT_PROBES_IN_TILE + auto key_location = bucket.find_key_location(pair.first, key_equal{}); + // check if key exists + if (key_location != -1) { + return {bucket.begin() + key_location, false}; + } load = bucket.compute_load(sentinel_pair); } do { @@ -205,20 +213,36 @@ __device__ bool bght::iht::const_iter return d_table_; } +template +typename iht::iterator __device__ + __host__ + iht::begin() { + return d_table_; +} + template ::const_iter return d_table_ + capacity_; } +template +typename iht::iterator __device__ + __host__ + iht::end() { + return d_table_ + capacity_; +} template -#include #include +#include namespace bght { namespace detail { @@ -63,6 +63,47 @@ __global__ void tiled_insert_kernel(InputIt first, InputIt last, HashMap map) { } } +template +__global__ void iht_tiled_insert_kernel(InputIt first, InputIt last, HashMap map) { + // construct the tile + auto thread_id = threadIdx.x + blockIdx.x * blockDim.x; + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + auto count = last - first; + if ((thread_id - tile.thread_rank()) >= count) { + return; + } + + bool do_op = false; + typename HashMap::value_type insertion_pair{}; + + // load the input + if (thread_id < count) { + insertion_pair = first[thread_id]; + do_op = true; + } + + bool success = true; + // Do the insertion + auto work_queue = tile.ballot(do_op); + while (work_queue) { + auto cur_rank = __ffs(work_queue) - 1; + auto cur_pair = tile.shfl(insertion_pair, cur_rank); + bool insertion_success = map.insert(cur_pair, tile).second; + + if (tile.thread_rank() == cur_rank) { + do_op = false; + success = insertion_success; + } + work_queue = tile.ballot(do_op); + } + + if (!tile.all(success)) { + *map.d_build_success_ = false; + } +} + template __global__ void tiled_find_kernel(InputIt first, InputIt last, diff --git a/include/bght/iht.hpp b/include/bght/iht.hpp index e1eddb1..eecdbfc 100644 --- a/include/bght/iht.hpp +++ b/include/bght/iht.hpp @@ -1,5 +1,5 @@ /* - * Copyright 2021 The Regents of the University of California, Davis + * Copyright 2021-2024 The Regents of the University of California, Davis * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace bght { @@ -144,18 +145,22 @@ struct iht { /** * @brief Device-side cooperative insertion API that inserts a single pair into the - * hash map. + * hash map if the key does not exist. * @tparam tile_type A cooperative group tile with a size that must match the bucket * size of the hash map (i.e., `bucket_size`). It must support the tile-wide * intrinsics `ballot`, `shfl` * @param pair A key-value pair to insert into the hash map. The pair must be the same * for all threads in the cooperative group tile * @param tile The cooperative group tile - * @return A boolean indicating success (true) or failure (false) of the insertion - * operation. + * @return A pair where the second element is a boolean indicating success (true) + * or failure (false) of the insertion operation. If insertion succeeded or the key + * exists, the first element in the pair contain a pointer to the inserted or old + * key-value pair, otherwise, the first pair element contain a pointer to the end of the + * map. */ template - __device__ bool insert(value_type const& pair, tile_type const& tile); + __device__ cuda::std::pair insert(value_type const& pair, + tile_type const& tile); /** * @brief Device-side cooperative find API that finds a single pair into the hash @@ -203,6 +208,21 @@ struct iht { */ __device__ __host__ const_iterator end() const; + /** + * @brief Returns an iterator to the first element of the tables including all invalid + * entries. + * + * @return iterator constant iterator to the first element of the table + */ + __device__ __host__ iterator begin(); + /** + * @brief Returns an iterator to the last element of the tables including all invalid + * entries. + * + * @return iterator constant iterator to the last element of the table + */ + __device__ __host__ iterator end(); + /** * @brief Returns the maximum number of elements the container is able to hold * @@ -237,6 +257,11 @@ struct iht { template friend __global__ void detail::kernels::tiled_insert_kernel(InputIt, InputIt, HashMap); + template + friend __global__ void detail::kernels::iht_tiled_insert_kernel(InputIt, + InputIt, + HashMap); + template friend __global__ void detail::kernels::tiled_find_kernel(InputIt, InputIt,