diff --git a/README.md b/README.md
index 34594ca..2c8976f 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,8 @@ After starting the container, you can build and execute BGHT code without any ad
 
 ### limitations
 * Currently hash tables based on cuckoo hashing do not support concurrent insertion and queries. IHT and P2BHT support concurrent insertions and queries.
-* Keys must be unique
+For hash tables that use a probing scheme other than IHT:
+* Keys must be unique.
 * Construction of the data structures offered *may* fail. In these scenarios, reconstructing the table using a larger capacity or a lower load factor should be considered. Our paper offers recommended hash table load factors (for uniformly distributed unsigned keys) to achieve at least a 99% success rate ([See Fig. 2](https://arxiv.org/abs/2108.07232)). For example, BCHT will offer a 100% success rate for up to 0.991 load factor. Please create an issue if you encounter any problems with different key distributions.
 
 ## Reproducing the arXiv paper results
diff --git a/examples/1cht_example.cu b/examples/1cht_example.cu
index d307047..b9b5611 100644
--- a/examples/1cht_example.cu
+++ b/examples/1cht_example.cu
@@ -1,5 +1,5 @@
 ﻿/*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@ int main(int argc, char** argv) {
     if (expected_pair.second != found_result) {
       std::cout << "Error: expected: " << expected_pair.second;
       std::cout << ", found: " << found_result << '\n';
-      return;
+      return 0;
     }
   }
   std::cout << "Success\n";
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index dcb3241..b2cc5f5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -11,6 +11,7 @@ set(example_targets
     custom_types_example
     iht_example
     hash_join
+    histogram
     )
 
 foreach(target ${example_targets})
diff --git a/examples/bcht_example.cu b/examples/bcht_example.cu
index 0d9da57..3da31b0 100644
--- a/examples/bcht_example.cu
+++ b/examples/bcht_example.cu
@@ -1,5 +1,5 @@
 ﻿/*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -110,7 +110,7 @@ int main(int argc, char** argv) {
     if (expected_pair.second != found_result) {
       std::cout << "Error: expected: " << expected_pair.second;
       std::cout << ", found: " << found_result << '\n';
-      return;
+      return 0;
     }
   }
   std::cout << "Success\n";
diff --git a/examples/histogram.cu b/examples/histogram.cu
new file mode 100644
index 0000000..9665422
--- /dev/null
+++ b/examples/histogram.cu
@@ -0,0 +1,149 @@
+/*
+ *   Copyright 2024 The Regents of the University of California, Davis
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+#include <bght/cmd.hpp>
+#include <bght/gpu_timer.hpp>
+#include <bght/iht.hpp>
+#include <bght/perf_report.hpp>
+#include <bght/rkg.hpp>
+#include <bght/tile_wide_queue.hpp>
+#include <limits>
+#include <type_traits>
+
+template <typename HashMap>
+__global__ void print(HashMap result_table) {
+  using pair_type = typename HashMap::value_type;
+  using key_type = typename HashMap::key_type;
+  using value_type = typename HashMap::mapped_type;
+
+  const auto capacity = result_table.max_size();
+  const auto sentinel_key = result_table.get_sentinel_key();
+  const auto sentinel_pair = result_table.get_sentinel_pair();
+
+  auto begin = result_table.begin();
+
+  auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  const auto pair = thread_id < capacity
+                        ? (begin + thread_id)->load(cuda::memory_order_relaxed)
+                        : sentinel_pair;
+  if (pair.first != sentinel_key) {
+    printf("result_map[%u] = %u\n", pair.first, pair.second);
+  };
+}
+
+template <typename HashMap, typename key_type>
+__global__ void histogram(HashMap map, key_type* keys, std::size_t count) {
+  using pair_type = typename HashMap::value_type;
+  using value_type = typename HashMap::mapped_type;
+
+  auto block = cooperative_groups::this_thread_block();
+  auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
+
+  using queue_type = bght::tile_wide_queue<key_type, decltype(tile)>;
+
+  const auto sentinel_key = map.get_sentinel_key();
+
+  auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+
+  const auto key = thread_id < count ? keys[thread_id] : sentinel_key;
+
+  queue_type work_queue(key, sentinel_key, tile);
+
+  while (!work_queue.empty()) {
+    auto cur_key = work_queue.front();
+    work_queue.pop();
+    // Try inserting the key with count of 1
+    auto result = map.insert({cur_key, 1}, tile);
+    const bool success = result.second == true;
+    // If insertion failed, one thread out of the tile need to try to atomically increment
+    // the count.
+    if (!success) {
+      // make sure one thread in the tile do the increment
+      if (tile.thread_rank() == 0) {
+        bool exchange_success{false};
+        auto expected = result.first->load(cuda::memory_order_relaxed);
+        while (!exchange_success) {
+          pair_type desired{expected.first, expected.second + 1};
+          cuda::memory_order success = cuda::memory_order_relaxed;
+          cuda::memory_order failure = cuda::memory_order_relaxed;
+          // `expected` here will be modified with the latest value if the
+          // `compare_exchange_strong` fails. On failure, the next iteration will add 1 to
+          // the last found value in memory.
+          // Similar logic can be used to implement min or max.
+          exchange_success =
+              result.first->compare_exchange_strong(expected, desired, success, failure);
+        }
+      }
+    }
+  }
+}
+
+int main(int, char**) {
+  using key_type = std::uint32_t;
+  using count_type = std::uint32_t;
+  using pair_type = bght::pair<key_type, count_type>;
+
+  std::vector<key_type> h_keys{512, 1, 1, 1, 1, 1, 1, 1, 1, 1,   2,  4,
+                               2,   2, 4, 4, 4, 5, 5, 5, 5, 100, 512};
+
+  std::cout << "Building a histogram for keys: ";
+  for (const auto k : h_keys) {
+    std::cout << k << ", ";
+  }
+  std::cout << std::endl;
+
+  const auto num_keys = h_keys.size();
+
+  auto invalid_key = std::numeric_limits<key_type>::max();
+  auto invalid_value = std::numeric_limits<count_type>::max();
+
+  const float load_factor = 0.7;
+
+  std::size_t capacity = double(num_keys) / load_factor;
+
+  using hash_map = bght::iht<key_type, count_type>;
+
+  hash_map map(capacity, invalid_key, invalid_value);
+
+  thrust::device_vector<key_type> d_keys(h_keys);
+
+  const uint32_t block_size = 128;
+  uint32_t num_blocks = (num_keys + block_size - 1) / block_size;
+  histogram<<<num_blocks, block_size>>>(map, d_keys.data().get(), num_keys);
+
+  cuda_try(cudaDeviceSynchronize());
+
+  std::cout << "Found results:" << std::endl;
+
+  num_blocks = (capacity + block_size - 1) / block_size;
+  print<<<num_blocks, block_size>>>(map);
+
+  cuda_try(cudaDeviceSynchronize());
+
+  std::cout << "Exepcted reults: " << std::endl;
+  std::unordered_map<key_type, count_type> histogram;
+  for (int key : h_keys) {
+    histogram[key]++;
+  }
+
+  for (const auto& pair : histogram) {
+    printf("ground_truth[%u] = %u\n", pair.first, pair.second);
+  }
+}
\ No newline at end of file
diff --git a/include/bght/detail/bucket.cuh b/include/bght/detail/bucket.cuh
index 8cd6c81..30633bf 100644
--- a/include/bght/detail/bucket.cuh
+++ b/include/bght/detail/bucket.cuh
@@ -1,5 +1,5 @@
 /*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -79,6 +79,19 @@ struct bucket {
     return cas_success;
   }
 
+  DEVICE_QUALIFIER
+  pair_type strong_cas_at_location_ret_old(
+      const pair_type& pair,
+      const int location,
+      const pair_type& sentinel,
+      cuda::memory_order success = cuda::memory_order_seq_cst,
+      cuda::memory_order failure = cuda::memory_order_seq_cst) {
+    pair_type expected = sentinel;
+    pair_type desired = pair;
+    ptr_[location].compare_exchange_strong(expected, desired, success, failure);
+    return expected;
+  }
+
   DEVICE_QUALIFIER
   pair_type exch_at_location(const pair_type& pair,
                              const int location,
@@ -87,6 +100,9 @@ struct bucket {
     return old;
   }
 
+  DEVICE_QUALIFIER
+  atomic_pair_type* begin() { return ptr_; }
+
  private:
   pair_type lane_pair_;
   atomic_pair_type* ptr_;
diff --git a/include/bght/detail/iht_impl.cuh b/include/bght/detail/iht_impl.cuh
index 855bbcd..94ce3c1 100644
--- a/include/bght/detail/iht_impl.cuh
+++ b/include/bght/detail/iht_impl.cuh
@@ -1,5 +1,5 @@
 /*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -136,7 +136,7 @@ bool iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::insert(
   const auto num_keys = std::distance(first, last);
   const uint32_t block_size = 128;
   const uint32_t num_blocks = (num_keys + block_size - 1) / block_size;
-  detail::kernels::tiled_insert_kernel<<<num_blocks, block_size, 0, stream>>>(
+  detail::kernels::iht_tiled_insert_kernel<<<num_blocks, block_size, 0, stream>>>(
       first, last, *this);
   bool success;
   cuda_try(cudaMemcpyAsync(
@@ -175,7 +175,10 @@ template <class Key,
           int B,
           int Threshold>
 template <typename tile_type>
-__device__ bool bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::insert(
+__device__ cuda::std::pair<
+    typename bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::iterator,
+    bool>
+bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::insert(
     value_type const& pair,
     tile_type const& tile) {
   auto primary_bucket = hfp_(pair.first) % num_buckets_;
@@ -184,7 +187,7 @@ __device__ bool bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold
   value_type sentinel_pair{sentinel_key_, sentinel_value_};
 
   if (key_equal{}(pair.first, sentinel_key_)) {
-    return false;
+    return {end(), false};
   }
 
   using bucket_type = detail::bucket<atomic_pair_type, value_type, tile_type>;
@@ -194,6 +197,11 @@ __device__ bool bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold
   if (threshold_ > 0) {
     bucket.load(cuda::memory_order_relaxed);
     INCREMENT_PROBES_IN_TILE
+    auto key_location = bucket.find_key_location(pair.first, key_equal{});
+    // check if key exists
+    if (key_location != -1) {
+      return {bucket.begin() + key_location, false};
+    }
     load = bucket.compute_load(sentinel_pair);
   }
   do {
@@ -205,20 +213,36 @@ __device__ bool bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold
       while (true) {
         bucket = bucket_type(&d_table_[bucket_id * bucket_size], tile);
         bucket.load(cuda::memory_order_relaxed);
+
+        // check if key exists
+        auto key_location = bucket.find_key_location(pair.first, key_equal{});
+        if (key_location != -1) {
+          return {bucket.begin() + key_location, false};
+        }
+
         load = bucket.compute_load(sentinel_pair);
         INCREMENT_PROBES_IN_TILE
         while (load < bucket_size) {
           bool cas_success = false;
+          bool key_exists = false;
           if (lane_id == elected_lane) {
-            cas_success = bucket.strong_cas_at_location(pair,
-                                                        load,
-                                                        sentinel_pair,
-                                                        cuda::memory_order_relaxed,
-                                                        cuda::memory_order_relaxed);
+            auto found =
+                bucket.strong_cas_at_location_ret_old(pair,
+                                                      load,
+                                                      sentinel_pair,
+                                                      cuda::memory_order_relaxed,
+                                                      cuda::memory_order_relaxed);
+            if (found.first == pair.first) {
+              key_exists = true;
+            }
+            if (found.first == sentinel_pair.first) {
+              cas_success = true;
+            }
           }
           cas_success = tile.shfl(cas_success, elected_lane);
-          if (cas_success) {
-            return true;
+          key_exists = tile.shfl(cas_success, elected_lane);
+          if (cas_success || key_exists) {
+            return {bucket.begin() + load, cas_success};
           }
           load++;
         }
@@ -226,21 +250,29 @@ __device__ bool bght::iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold
       }
     } else {
       bool cas_success = false;
+      bool key_exists = false;
       if (lane_id == elected_lane) {
-        cas_success = bucket.strong_cas_at_location(pair,
-                                                    load,
-                                                    sentinel_pair,
-                                                    cuda::memory_order_relaxed,
-                                                    cuda::memory_order_relaxed);
+        auto found = bucket.strong_cas_at_location_ret_old(pair,
+                                                           load,
+                                                           sentinel_pair,
+                                                           cuda::memory_order_relaxed,
+                                                           cuda::memory_order_relaxed);
+        if (found.first == pair.first) {
+          key_exists = true;
+        }
+        if (found.first == sentinel_pair.first) {
+          cas_success = true;
+        }
       }
+      key_exists = tile.shfl(cas_success, elected_lane);
       cas_success = tile.shfl(cas_success, elected_lane);
-      if (cas_success) {
-        return true;
+      if (cas_success || key_exists) {
+        return {bucket.begin() + load, cas_success};
       }
       load++;
     }
   } while (true);
-  return false;
+  return {end(), false};
 }
 
 template <class Key,
@@ -353,6 +385,20 @@ typename iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::const_iter
   return d_table_;
 }
 
+template <class Key,
+          class T,
+          class Hash,
+          class KeyEqual,
+          cuda::thread_scope Scope,
+          typename Allocator,
+          int B,
+          int Threshold>
+typename iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::iterator __device__
+    __host__
+    iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::begin() {
+  return d_table_;
+}
+
 template <class Key,
           class T,
           class Hash,
@@ -367,6 +413,19 @@ typename iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::const_iter
   return d_table_ + capacity_;
 }
 
+template <class Key,
+          class T,
+          class Hash,
+          class KeyEqual,
+          cuda::thread_scope Scope,
+          typename Allocator,
+          int B,
+          int Threshold>
+typename iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::iterator __device__
+    __host__
+    iht<Key, T, Hash, KeyEqual, Scope, Allocator, B, Threshold>::end() {
+  return d_table_ + capacity_;
+}
 template <class Key,
           class T,
           class Hash,
diff --git a/include/bght/detail/kernels.cuh b/include/bght/detail/kernels.cuh
index 21dd3fe..a22c6f4 100644
--- a/include/bght/detail/kernels.cuh
+++ b/include/bght/detail/kernels.cuh
@@ -1,5 +1,5 @@
 /*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 
 #pragma once
 #include <cooperative_groups.h>
-#include <cub/cub.cuh>
 #include <bght/detail/cuda_helpers.cuh>
+#include <cub/cub.cuh>
 
 namespace bght {
 namespace detail {
@@ -63,6 +63,47 @@ __global__ void tiled_insert_kernel(InputIt first, InputIt last, HashMap map) {
   }
 }
 
+template <typename InputIt, typename HashMap>
+__global__ void iht_tiled_insert_kernel(InputIt first, InputIt last, HashMap map) {
+  // construct the tile
+  auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+  auto block = cooperative_groups::this_thread_block();
+  auto tile = cooperative_groups::tiled_partition<HashMap::bucket_size>(block);
+
+  auto count = last - first;
+  if ((thread_id - tile.thread_rank()) >= count) {
+    return;
+  }
+
+  bool do_op = false;
+  typename HashMap::value_type insertion_pair{};
+
+  // load the input
+  if (thread_id < count) {
+    insertion_pair = first[thread_id];
+    do_op = true;
+  }
+
+  bool success = true;
+  // Do the insertion
+  auto work_queue = tile.ballot(do_op);
+  while (work_queue) {
+    auto cur_rank = __ffs(work_queue) - 1;
+    auto cur_pair = tile.shfl(insertion_pair, cur_rank);
+    bool insertion_success = map.insert(cur_pair, tile).second;
+
+    if (tile.thread_rank() == cur_rank) {
+      do_op = false;
+      success = insertion_success;
+    }
+    work_queue = tile.ballot(do_op);
+  }
+
+  if (!tile.all(success)) {
+    *map.d_build_success_ = false;
+  }
+}
+
 template <typename InputIt, typename OutputIt, typename HashMap>
 __global__ void tiled_find_kernel(InputIt first,
                                   InputIt last,
diff --git a/include/bght/iht.hpp b/include/bght/iht.hpp
index e1eddb1..eecdbfc 100644
--- a/include/bght/iht.hpp
+++ b/include/bght/iht.hpp
@@ -1,5 +1,5 @@
 /*
- *   Copyright 2021 The Regents of the University of California, Davis
+ *   Copyright 2021-2024 The Regents of the University of California, Davis
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <bght/hash_functions.hpp>
 #include <bght/pair.cuh>
 #include <cuda/atomic>
+#include <cuda/std/utility>
 #include <memory>
 
 namespace bght {
@@ -144,18 +145,22 @@ struct iht {
 
   /**
    * @brief Device-side cooperative insertion API that inserts a single pair into the
-   * hash map.
+   * hash map if the key does not exist.
    * @tparam tile_type A cooperative group tile with a size that must match the bucket
    * size of the hash map (i.e., `bucket_size`). It must support the tile-wide
    * intrinsics `ballot`, `shfl`
    * @param pair A key-value pair to insert into the hash map. The pair must be the same
    * for all threads in the  cooperative group tile
    * @param tile  The cooperative group tile
-   * @return A boolean indicating success (true) or failure (false) of the insertion
-   * operation.
+   * @return A pair where the second element is a boolean indicating success (true)
+   * or failure (false) of the insertion operation. If insertion succeeded or the key
+   * exists, the first element in the pair contain a pointer to the inserted or old
+   * key-value pair, otherwise, the first pair element contain a pointer to the end of the
+   * map.
    */
   template <typename tile_type>
-  __device__ bool insert(value_type const& pair, tile_type const& tile);
+  __device__ cuda::std::pair<iterator, bool> insert(value_type const& pair,
+                                                    tile_type const& tile);
 
   /**
    * @brief Device-side cooperative find API that finds a single pair into the hash
@@ -203,6 +208,21 @@ struct iht {
    */
   __device__ __host__ const_iterator end() const;
 
+  /**
+   * @brief Returns an iterator to the first element of the tables including all invalid
+   * entries.
+   *
+   * @return iterator constant iterator to the first element of the table
+   */
+  __device__ __host__ iterator begin();
+  /**
+   * @brief Returns an iterator to the last element of the tables including all invalid
+   * entries.
+   *
+   * @return iterator constant iterator to the last element of the table
+   */
+  __device__ __host__ iterator end();
+
   /**
    * @brief Returns the maximum number of elements the container is able to hold
    *
@@ -237,6 +257,11 @@ struct iht {
   template <typename InputIt, typename HashMap>
   friend __global__ void detail::kernels::tiled_insert_kernel(InputIt, InputIt, HashMap);
 
+  template <typename InputIt, typename HashMap>
+  friend __global__ void detail::kernels::iht_tiled_insert_kernel(InputIt,
+                                                                  InputIt,
+                                                                  HashMap);
+
   template <typename InputIt, typename OutputIt, typename HashMap>
   friend __global__ void detail::kernels::tiled_find_kernel(InputIt,
                                                             InputIt,