When wider slabs are enabled, use overflows and underflows of individual cpu caches while growing slabs.

PiperOrigin-RevId: 570113325
Change-Id: I8af5b82506790bedfcd1c43ffa9376c3680ee20c
v-gogte authored and copybara-github committed Oct 2, 2023
1 parent 73fd374 commit b2f29d1
Showing 2 changed files with 160 additions and 48 deletions.
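
For context, the decision this commit changes can be summarized by the following standalone sketch (illustrative only: MissStats, Resize, and ShouldResizeSlabSketch are placeholder names, not the actual tcmalloc types). With wider slabs enabled, the slab is grown as soon as any single cpu cache's overflow/underflow ratio exceeds the grow threshold; otherwise the aggregate ratio across all cpu caches is used, as before:

#include <cstddef>
#include <vector>

// Illustrative stand-ins for tcmalloc's CpuCacheMissStats and DynamicSlabResize.
struct MissStats {
  std::size_t underflows = 0;
  std::size_t overflows = 0;
};

enum class Resize { kNoop, kShrink, kGrow };

// Sketch of the resize decision: per_cpu_misses holds the miss counts each cpu
// cache accumulated over the last resize interval; the thresholds correspond
// to the per_cpu_caches_dynamic_slab_{grow,shrink}_threshold parameters.
Resize ShouldResizeSlabSketch(const std::vector<MissStats>& per_cpu_misses,
                              bool wider_slabs_enabled, double grow_threshold,
                              double shrink_threshold) {
  MissStats total;
  bool any_cpu_wants_growth = false;
  for (const MissStats& m : per_cpu_misses) {
    total.overflows += m.overflows;
    total.underflows += m.underflows;
    // With wider slabs, a single cpu cache whose overflows dominate its
    // underflows is enough to trigger growth.
    if (m.overflows > m.underflows * grow_threshold) {
      any_cpu_wants_growth = true;
    }
  }
  if (wider_slabs_enabled && any_cpu_wants_growth) return Resize::kGrow;
  // Otherwise fall back to the aggregate heuristic over all cpu caches,
  // as before this commit.
  if (total.overflows > total.underflows * grow_threshold) return Resize::kGrow;
  if (total.overflows < total.underflows * shrink_threshold) {
    return Resize::kShrink;
  }
  return Resize::kNoop;
}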
56 changes: 49 additions & 7 deletions tcmalloc/cpu_cache.h
@@ -221,6 +221,12 @@ class CpuCache {
}
};

enum class DynamicSlabResize {
kNoop = 0,
kShrink,
kGrow,
};

enum class PerClassMissType {
// Tracks total number of misses.
kTotal = 0,
@@ -578,6 +584,10 @@ class CpuCache {
size_t Steal(int cpu, size_t size_class, size_t bytes,
ObjectsToReturn* to_return);

// Depending on the number of misses that cpu caches encountered in the
// previous resize interval, returns whether slabs should be grown, shrunk,
// or remain the same.
DynamicSlabResize ShouldResizeSlab();
// Records a cache underflow or overflow on <cpu>, increments underflow or
// overflow by 1.
// <is_alloc> determines whether the associated count corresponds to an
@@ -1894,15 +1904,31 @@ inline auto CpuCache<Forwarder>::AllocOrReuseSlabs(
}

template <class Forwarder>
void CpuCache<Forwarder>::ResizeSlabIfNeeded() ABSL_NO_THREAD_SAFETY_ANALYSIS {
uint8_t per_cpu_shift = freelist_.GetShift();

inline typename CpuCache<Forwarder>::DynamicSlabResize
CpuCache<Forwarder>::ShouldResizeSlab() {
const int num_cpus = NumCPUs();
CpuCacheMissStats total_misses{};
DynamicSlabResize resize = DynamicSlabResize::kNoop;
const bool wider_slabs_enabled = UseWiderSlabs();
for (int cpu = 0; cpu < num_cpus; ++cpu) {
total_misses +=
CpuCacheMissStats misses =
GetAndUpdateIntervalCacheMissStats(cpu, MissCount::kSlabResize);
total_misses += misses;

if (misses.overflows >
misses.underflows *
forwarder_.per_cpu_caches_dynamic_slab_grow_threshold()) {
resize = DynamicSlabResize::kGrow;
}
}

// When the wider slabs feature is enabled, we try to grow slabs when the grow
// condition is met for at least one cpu cache. Otherwise, we use total misses
// to figure out whether to grow the slab, shrink it, or do nothing.
if (wider_slabs_enabled && resize == DynamicSlabResize::kGrow) {
return resize;
}

// As a simple heuristic, we decide to grow if the total number of overflows
// is large compared to total number of underflows during the growth period.
// If the slab size was infinite, we would expect 0 overflows. If the slab
@@ -1911,14 +1937,30 @@ void CpuCache<Forwarder>::ResizeSlabIfNeeded() ABSL_NO_THREAD_SAFETY_ANALYSIS {
if (total_misses.overflows >
total_misses.underflows *
forwarder_.per_cpu_caches_dynamic_slab_grow_threshold()) {
return DynamicSlabResize::kGrow;
} else if (total_misses.overflows <
total_misses.underflows *
forwarder_.per_cpu_caches_dynamic_slab_shrink_threshold()) {
return DynamicSlabResize::kShrink;
}

return DynamicSlabResize::kNoop;
}

template <class Forwarder>
void CpuCache<Forwarder>::ResizeSlabIfNeeded() ABSL_NO_THREAD_SAFETY_ANALYSIS {
uint8_t per_cpu_shift = freelist_.GetShift();

const int num_cpus = NumCPUs();
const DynamicSlabResize resize = ShouldResizeSlab();

if (resize == DynamicSlabResize::kGrow) {
if (per_cpu_shift == shift_bounds_.max_shift) return;
++per_cpu_shift;
dynamic_slab_info_
.grow_count[ShiftOffset(per_cpu_shift, shift_bounds_.initial_shift)]
.fetch_add(1, std::memory_order_relaxed);
} else if (total_misses.overflows <
total_misses.underflows *
forwarder_.per_cpu_caches_dynamic_slab_shrink_threshold()) {
} else if (resize == DynamicSlabResize::kShrink) {
if (per_cpu_shift == shift_bounds_.initial_shift) return;
--per_cpu_shift;
dynamic_slab_info_
152 changes: 111 additions & 41 deletions tcmalloc/cpu_cache_test.cc
@@ -124,12 +124,15 @@ class TestStaticForwarder {
bool resize_size_classes_enabled() { return resize_size_classes_enabled_; }

double per_cpu_caches_dynamic_slab_grow_threshold() {
if (dynamic_slab_grow_threshold_ >= 0) return dynamic_slab_grow_threshold_;
return dynamic_slab_ == DynamicSlab::kGrow
? -1.0
: std::numeric_limits<double>::max();
}

double per_cpu_caches_dynamic_slab_shrink_threshold() {
if (dynamic_slab_shrink_threshold_ >= 0)
return dynamic_slab_shrink_threshold_;
return dynamic_slab_ == DynamicSlab::kShrink
? std::numeric_limits<double>::max()
: -1.0;
@@ -181,6 +184,8 @@ class TestStaticForwarder {
int64_t arena_reported_impending_bytes_ = 0;
size_t shrink_to_usage_limit_calls_ = 0;
bool dynamic_slab_enabled_ = false;
double dynamic_slab_grow_threshold_ = -1;
double dynamic_slab_shrink_threshold_ = -1;
bool wider_slabs_enabled_ = false;
DynamicSlab dynamic_slab_ = DynamicSlab::kNoop;
bool resize_size_classes_enabled_ = false;
@@ -845,6 +850,112 @@ TEST(CpuCacheTest, ResizeSizeClassesTest) {
cache.Deactivate();
}

// Runs a single allocate and deallocate operation to warm up the cache. Once a
// few objects are allocated in the cold cache, we can shuffle cpu caches to
// steal that capacity from the cold cache to the hot cache.
static void ColdCacheOperations(CpuCache& cache, int cpu_id,
size_t size_class) {
// Temporarily fake being on the given CPU.
ScopedFakeCpuId fake_cpu_id(cpu_id);
void* ptr = cache.Allocate<NothrowPolicy>(size_class);
cache.Deallocate(ptr, size_class);
}

// Runs multiple allocate and deallocate operations on the cpu cache to collect
// misses. Once we collect enough misses on this cache, we can shuffle cpu
// caches to steal capacity from colder caches to the hot cache.
static void HotCacheOperations(CpuCache& cache, int cpu_id) {
constexpr size_t kPtrs = 4096;
std::vector<void*> ptrs;
ptrs.resize(kPtrs);

// Temporarily fake being on the given CPU.
ScopedFakeCpuId fake_cpu_id(cpu_id);

// Allocate and deallocate objects to make sure we have enough misses on the
// cache. This will make sure we have sufficient disparity in misses between
// the hotter and colder cache, and that we may be able to steal bytes from
// the colder cache.
for (size_t size_class = 1; size_class <= 2; ++size_class) {
for (auto& ptr : ptrs) {
ptr = cache.Allocate<NothrowPolicy>(size_class);
}
for (void* ptr : ptrs) {
cache.Deallocate(ptr, size_class);
}
}

// We reclaim the cache to reset it so that we record underflows/overflows the
// next time we allocate and deallocate objects. Without reclaim, the cache
// would stay warmed up and it would take more time to drain the colder cache.
cache.Reclaim(cpu_id);
}

// Test that we are complying with the threshold when we grow the slab.
// When wider slabs are enabled, we check whether the overflow/underflow ratio
// is above the threshold for individual cpu caches.
TEST(CpuCacheTest, DynamicSlabThreshold) {
if (!subtle::percpu::IsFast()) {
return;
}

constexpr double kDynamicSlabGrowThreshold = 0.9;
for (bool wider_slabs : {false, true}) {
CpuCache cache;
TestStaticForwarder& forwarder = cache.forwarder();
forwarder.dynamic_slab_enabled_ = true;
forwarder.dynamic_slab_grow_threshold_ = kDynamicSlabGrowThreshold;
forwarder.wider_slabs_enabled_ = wider_slabs;

cache.Activate();

constexpr int kCpuId0 = 0;
constexpr int kCpuId1 = 1;

// Accumulate overflows and underflows for kCpuId0.
HotCacheOperations(cache, kCpuId0);
CpuCache::CpuCacheMissStats interval_misses =
cache.GetIntervalCacheMissStats(kCpuId0, MissCount::kSlabResize);
// Make sure that overflows/underflows ratio is greater than the threshold
// for kCpuId0 cache.
ASSERT_GT(interval_misses.overflows,
interval_misses.underflows * kDynamicSlabGrowThreshold);

// Perform allocations on kCpuId1 so that we accumulate only underflows.
// Reclaim after each allocation such that we have no objects in the cache
// for the next allocation.
for (int i = 0; i < 1024; ++i) {
ColdCacheOperations(cache, kCpuId1, /*size_class=*/1);
cache.Reclaim(kCpuId1);
}

// Total overflows/underflows ratio must be less than grow threshold now.
CpuCache::CpuCacheMissStats total_misses =
cache.GetIntervalCacheMissStats(kCpuId0, MissCount::kSlabResize);
total_misses +=
cache.GetIntervalCacheMissStats(kCpuId1, MissCount::kSlabResize);
ASSERT_LT(total_misses.overflows,
total_misses.underflows * kDynamicSlabGrowThreshold);

cpu_cache_internal::SlabShiftBounds shift_bounds =
cache.GetPerCpuSlabShiftBounds();
const int shift = shift_bounds.initial_shift;
EXPECT_EQ(CpuCachePeer::GetSlabShift(cache), shift);
cache.ResizeSlabIfNeeded();

// If wider slabs are enabled, we must use overflows and underflows of
// individual cpu caches to decide whether to grow the slab. Hence, the
// slab should have grown. If wider slabs are disabled, the slab shift should
// stay the same, as the total miss ratio is lower than
// kDynamicSlabGrowThreshold.
if (wider_slabs) {
EXPECT_EQ(CpuCachePeer::GetSlabShift(cache), shift + 1);
} else {
EXPECT_EQ(CpuCachePeer::GetSlabShift(cache), shift);
}
}
}

// Test that when dynamic slab parameters change, things still work.
TEST(CpuCacheTest, DynamicSlabParamsChange) {
if (!subtle::percpu::IsFast()) {
@@ -902,47 +1013,6 @@ TEST(CpuCacheTest, SlabUsage) {
CpuCachePeer::ValidateSlabBytes(tc_globals.cpu_cache());
}

// Runs a single allocate and deallocate operation to warm up the cache. Once a
// few objects are allocated in the cold cache, we can shuffle cpu caches to
// steal that capacity from the cold cache to the hot cache.
static void ColdCacheOperations(CpuCache& cache, int cpu_id,
size_t size_class) {
// Temporarily fake being on the given CPU.
ScopedFakeCpuId fake_cpu_id(cpu_id);
void* ptr = cache.Allocate<NothrowPolicy>(size_class);
cache.Deallocate(ptr, size_class);
}

// Runs multiple allocate and deallocate operation on the cpu cache to collect
// misses. Once we collect enough misses on this cache, we can shuffle cpu
// caches to steal capacity from colder caches to the hot cache.
static void HotCacheOperations(CpuCache& cache, int cpu_id) {
constexpr size_t kPtrs = 4096;
std::vector<void*> ptrs;
ptrs.resize(kPtrs);

// Temporarily fake being on the given CPU.
ScopedFakeCpuId fake_cpu_id(cpu_id);

// Allocate and deallocate objects to make sure we have enough misses on the
// cache. This will make sure we have sufficient disparity in misses between
// the hotter and colder cache, and that we may be able to steal bytes from
// the colder cache.
for (size_t size_class = 1; size_class <= 2; ++size_class) {
for (auto& ptr : ptrs) {
ptr = cache.Allocate<NothrowPolicy>(size_class);
}
for (void* ptr : ptrs) {
cache.Deallocate(ptr, size_class);
}
}

// We reclaim the cache to reset it so that we record underflows/overflows the
// next time we allocate and deallocate objects. Without reclaim, the cache
// would stay warmed up and it would take more time to drain the colder cache.
cache.Reclaim(cpu_id);
}

TEST(CpuCacheTest, ColdHotCacheShuffleTest) {
if (!subtle::percpu::IsFast()) {
return;
