Take hz into account in activerehashing to avoid CPU spikes (#977)

Currently, the conf file describes activerehashing as: "Active rehashing uses 1 millisecond every 100 milliseconds of CPU time." This is only true for hz = 10; if hz is changed, the description becomes inaccurate. Users may notice that the server spends more CPU on active rehashing at a high hz without knowing why, since the rehashing time per cron call is fixed at 1ms. This PR takes hz into account and fixes the CPU usage at 1% (this may not be exact in some cases, because dictRehashMicroseconds rehashes in batches of 100 steps, but it avoids CPU spikes in this case). This PR also improves the description of the activerehashing configuration item to explain this change. Signed-off-by: Binbin <[email protected]> Co-authored-by: Viktor Söderqvist <[email protected]>
valkey-io · Oct 15, 2024 · dc05a32 · dc05a32
1 parent e30ae76
commit dc05a32
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 25 deletions.
diff --git a/src/server.c b/src/server.c
@@ -1090,12 +1090,13 @@ void databasesCron(void) {
         /* Rehash */
         if (server.activerehashing) {
             uint64_t elapsed_us = 0;
+            uint64_t threshold_us = 1 * 1000000 / server.hz / 100;
             for (j = 0; j < dbs_per_call; j++) {
                 serverDb *db = &server.db[rehash_db % server.dbnum];
-                elapsed_us += kvstoreIncrementallyRehash(db->keys, INCREMENTAL_REHASHING_THRESHOLD_US - elapsed_us);
-                if (elapsed_us >= INCREMENTAL_REHASHING_THRESHOLD_US) break;
-                elapsed_us += kvstoreIncrementallyRehash(db->expires, INCREMENTAL_REHASHING_THRESHOLD_US - elapsed_us);
-                if (elapsed_us >= INCREMENTAL_REHASHING_THRESHOLD_US) break;
+                elapsed_us += kvstoreIncrementallyRehash(db->keys, threshold_us - elapsed_us);
+                if (elapsed_us >= threshold_us) break;
+                elapsed_us += kvstoreIncrementallyRehash(db->expires, threshold_us - elapsed_us);
+                if (elapsed_us >= threshold_us) break;
                 rehash_db++;
             }
         }

diff --git a/src/server.h b/src/server.h
@@ -140,9 +140,8 @@ struct hdr_histogram;
 #define CONFIG_BINDADDR_MAX 16
 #define CONFIG_MIN_RESERVED_FDS 32
 #define CONFIG_DEFAULT_PROC_TITLE_TEMPLATE "{title} {listen-addr} {server-mode}"
-#define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \
-                                                  channel to establish psync. */
-#define INCREMENTAL_REHASHING_THRESHOLD_US 1000
+#define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60      /* Grace period in seconds for replica main \
+                                                     * channel to establish psync. */
 #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */
 
 /* Bucket sizes for client eviction pools. Each bucket stores clients with

diff --git a/valkey.conf b/valkey.conf
@@ -2082,24 +2082,22 @@ hll-sparse-max-bytes 3000
 stream-node-max-bytes 4096
 stream-node-max-entries 100
 
-# Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in
-# order to help rehashing the main server hash table (the one mapping top-level
-# keys to values). The hash table implementation the server uses (see dict.c)
-# performs a lazy rehashing: the more operation you run into a hash table
-# that is rehashing, the more rehashing "steps" are performed, so if the
-# server is idle the rehashing is never complete and some more memory is used
-# by the hash table.
-#
-# The default is to use this millisecond 10 times every second in order to
-# actively rehash the main dictionaries, freeing memory when possible.
-#
-# If unsure:
-# use "activerehashing no" if you have hard latency requirements and it is
-# not a good thing in your environment that the server can reply from time to time
-# to queries with 2 milliseconds delay.
-#
-# use "activerehashing yes" if you don't have such hard requirements but
-# want to free memory asap when possible.
+# Active rehashing uses 1% of the CPU time to help perform incremental rehashing
+# of the main server hash tables, the ones mapping top-level keys to values.
+#
+# If active rehashing is disabled and rehashing is needed, a hash table is
+# rehashed one "step" on every operation performed on the hash table (add, find,
+# etc.), so if the server is idle, the rehashing may never complete and some
+# more memory is used by the hash tables. Active rehashing helps prevent this.
+#
+# Active rehashing runs as a background task. Depending on the value of 'hz',
+# the frequency at which the server performs background tasks, active rehashing
+# can cause the server to freeze for a short time. For example, if 'hz' is set
+# to 10, active rehashing runs for up to one millisecond every 100 milliseconds.
+# If a freeze of one millisecond is not acceptable, you can increase 'hz' to let
+# active rehashing run more often. If instead 'hz' is set to 100, active
+# rehashing runs for up to only 100 microseconds every 10 milliseconds. The
+# total is still 1% of the time.
 activerehashing yes
 
 # The client output buffer limits can be used to force disconnection of clients