From 5a4c0640cebe922de563e03ff2a683b89612f522 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Tue, 29 Oct 2024 14:26:17 -0700 Subject: [PATCH 01/92] Mark main and serverAssert as weak symbols to be overridden (#1232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At some point unit tests stopped building on MacOS because of duplicate symbols. I had originally solved this problem by using a flag that overrides symbols, but the much better solution is to mark the duplicate symbols as weak and they can be overridden during linking. (Symbols by default are strong, strong symbols override weak symbols) I also added macos unit build to the CI, so that this doesn't silently break in the future again. --------- Signed-off-by: Madelyn Olson Co-authored-by: Viktor Söderqvist --- .github/workflows/ci.yml | 2 +- src/Makefile | 11 +---------- src/debug.c | 2 +- src/server.c | 3 ++- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1d23f40fa..48a94ef984 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,7 +83,7 @@ jobs: steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j3 SERVER_CFLAGS='-Werror' + run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' build-32bit: runs-on: ubuntu-latest diff --git a/src/Makefile b/src/Makefile index 020b70d6d5..ae2de1c626 100644 --- a/src/Makefile +++ b/src/Makefile @@ -98,15 +98,6 @@ ifeq ($(USE_JEMALLOC),no) MALLOC=libc endif -# Some unit tests compile files a second time to get access to static functions, the "--allow-multiple-definition" flag -# allows us to do that without an error, by using the first instance of function. This behavior can also be used -# to tweak behavior of code just for unit tests. The version of ld on MacOS apparently always does this. 
-ifneq ($(uname_S),Darwin) - ALLOW_DUPLICATE_FLAG=-Wl,--allow-multiple-definition -else - ALLOW_DUPLICATE_FLAG= -endif - ifdef SANITIZER ifeq ($(SANITIZER),address) MALLOC=libc @@ -494,7 +485,7 @@ $(ENGINE_LIB_NAME): $(ENGINE_SERVER_OBJ) # valkey-unit-tests $(ENGINE_UNIT_TESTS): $(ENGINE_TEST_OBJ) $(ENGINE_LIB_NAME) - $(SERVER_LD) $(ALLOW_DUPLICATE_FLAG) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS) + $(SERVER_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS) # valkey-sentinel $(ENGINE_SENTINEL_NAME): $(SERVER_NAME) diff --git a/src/debug.c b/src/debug.c index 98512fd436..d221a884ee 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1023,7 +1023,7 @@ void debugCommand(client *c) { /* =========================== Crash handling ============================== */ -__attribute__((noinline)) void _serverAssert(const char *estr, const char *file, int line) { +__attribute__((noinline, weak)) void _serverAssert(const char *estr, const char *file, int line) { int new_report = bugReportStart(); serverLog(LL_WARNING, "=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE "); serverLog(LL_WARNING, "==> %s:%d '%s' is not true", file, line, estr); diff --git a/src/server.c b/src/server.c index e95012eefa..508edc7112 100644 --- a/src/server.c +++ b/src/server.c @@ -6810,7 +6810,8 @@ serverTestProc *getTestProcByName(const char *name) { } #endif -int main(int argc, char **argv) { +/* Main is marked as weak so that unit tests can use their own main function. 
*/ +__attribute__((weak)) int main(int argc, char **argv) { struct timeval tv; int j; char config_from_stdin = 0; From 13f5f665f259f229d707116432f8ef2969cae0c7 Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Tue, 29 Oct 2024 19:19:56 -0400 Subject: [PATCH 02/92] Update the argument of clusterNodeGetReplica declaration (#1239) clusterNodeGetReplica agrumnets are missed to migrate during the slave to replication migration so updated the argument slave to replica. Signed-off-by: Shivshankar-Reddy --- src/cluster.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.h b/src/cluster.h index 2e4f33a3c9..65eadf4c65 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -96,7 +96,7 @@ int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); char *clusterNodeGetShardId(clusterNode *node); int clusterNodeNumReplicas(clusterNode *node); -clusterNode *clusterNodeGetReplica(clusterNode *node, int slave_idx); +clusterNode *clusterNodeGetReplica(clusterNode *node, int replica_idx); clusterNode *getMigratingSlotDest(int slot); clusterNode *getImportingSlotSource(int slot); clusterNode *getNodeBySlot(int slot); From 789a73b0d0fc9e2b754adbb39ed3ca92e9c30669 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 30 Oct 2024 10:25:50 +0800 Subject: [PATCH 03/92] Minor fix to debug logging in replicationFeedStreamFromPrimaryStream (#1235) We should only print logs when hide-user-data-from-log is off. Signed-off-by: Binbin --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index a92bb79984..8ff8ad3f0f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -651,7 +651,7 @@ void replicationFeedStreamFromPrimaryStream(char *buf, size_t buflen) { /* Debugging: this is handy to see the stream sent from primary * to replicas. Disabled with if(0). 
*/ if (0) { - if (server.hide_user_data_from_log) { + if (!server.hide_user_data_from_log) { printf("%zu:", buflen); for (size_t j = 0; j < buflen; j++) { printf("%c", isprint(buf[j]) ? buf[j] : '.'); From ab98f375db51c83f3c56beac9440097f81af2048 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 30 Oct 2024 18:12:42 +0800 Subject: [PATCH 04/92] RDMA: Delete keepalive timer on closing (#1237) Typically, RDMA connection gets closed by client side, the server side handles diconnected CM event, and delete keepalive timer correctly. However, the server side may close connection voluntarily, for example the maxium connections exceed. Handle this case to avoid invalid memory access. Signed-off-by: zhenwei pi --- src/rdma.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/rdma.c b/src/rdma.c index 15e23758b7..9329fd3ab5 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -451,13 +451,22 @@ static int rdmaHandleEstablished(struct rdma_cm_event *ev) { return C_OK; } +static inline void rdmaDelKeepalive(aeEventLoop *el, RdmaContext *ctx) { + if (ctx->keepalive_te == AE_ERR) { + return; + } + + aeDeleteTimeEvent(el, ctx->keepalive_te); + ctx->keepalive_te = AE_ERR; +} + static int rdmaHandleDisconnect(aeEventLoop *el, struct rdma_cm_event *ev) { struct rdma_cm_id *cm_id = ev->id; RdmaContext *ctx = cm_id->context; connection *conn = ctx->conn; rdma_connection *rdma_conn = (rdma_connection *)conn; - aeDeleteTimeEvent(el, ctx->keepalive_te); + rdmaDelKeepalive(el, ctx); conn->state = CONN_STATE_CLOSED; /* we can't close connection now, let's mark this connection as closed state */ @@ -1173,6 +1182,7 @@ static void connRdmaClose(connection *conn) { } ctx = cm_id->context; + rdmaDelKeepalive(server.el, ctx); rdma_disconnect(cm_id); /* poll all CQ before close */ From 91cbf7744256b365651d4bc039c2913ecde9dfe6 Mon Sep 17 00:00:00 2001 From: Masahiro Ide Date: Fri, 1 Nov 2024 03:30:05 +0900 Subject: [PATCH 05/92] Eliminate snprintf usage at 
setDeferredAggregateLen (#1234) to align with how we encode the length at `_addReplyLongLongWithPrefix` Signed-off-by: Masahiro Ide Co-authored-by: Masahiro Ide --- src/networking.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 6751f5c7b8..96dd05d505 100644 --- a/src/networking.c +++ b/src/networking.c @@ -888,8 +888,11 @@ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) { } char lenstr[128]; - size_t lenstr_len = snprintf(lenstr, sizeof(lenstr), "%c%ld\r\n", prefix, length); - setDeferredReply(c, node, lenstr, lenstr_len); + lenstr[0] = prefix; + size_t lenstr_len = ll2string(lenstr + 1, sizeof(lenstr) - 1, length); + lenstr[lenstr_len + 1] = '\r'; + lenstr[lenstr_len + 2] = '\n'; + setDeferredReply(c, node, lenstr, lenstr_len + 3); } void setDeferredArrayLen(client *c, void *node, long length) { From 1c222f77cecc100719bbc87c7a2ecd13402fe6db Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Thu, 31 Oct 2024 11:37:53 -0700 Subject: [PATCH 06/92] Improve performance of sdssplitargs (#1230) The current implementation of `sdssplitargs` does repeated `sdscatlen` to build the parsed arguments, which isn't very efficient because it does a lot of extra reallocations and moves through the sds code a lot. It also typically results in memory overhead, because `sdscatlen` over-allocates, which is usually not needed since args are usually not modified after being created. The new implementation of sdssplitargs does two passes, the first to parse the argument to figure out the final length and the second to actually copy the string. It's generally about 2x faster for larger strings (~100 bytes), and about 20% faster for small strings (~10 bytes). This is generally faster since as long as everything is in the CPU cache, it's going to be fast. 
There are a couple of sanity tests, none existed before, as well as some fuzzying which was used to find some bugs and also to do the benchmarking. The original benchmarking code can be seen https://github.com/valkey-io/valkey/pull/1230/commits/6576aeb86adfb5afa74aefb5bc2d2becde95ce4c. ``` test_sdssplitargs_benchmark - unit/test_sds.c:530] Using random seed: 1729883235 [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.44%, new:13039us, old:29930us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.58%, new:12057us, old:27771us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 59.18%, new:9048us, old:22165us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 54.61%, new:12381us, old:27278us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 51.17%, new:16012us, old:32793us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 49.18%, new:16041us, old:31563us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.40%, new:12450us, old:29930us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 56.49%, new:13066us, old:30031us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 58.75%, new:12744us, old:30894us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 52.44%, new:16885us, old:35504us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.57%, new:8107us, old:21659us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 62.12%, new:8320us, old:21966us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 45.23%, new:13960us, old:25487us [test_sdssplitargs_benchmark - unit/test_sds.c:577] Improvement: 57.95%, new:9188us, old:21849us ``` --------- Signed-off-by: Madelyn Olson --- src/sds.c | 181 +++++++++++++++++++++++------------------- src/unit/test_files.h | 3 +- src/unit/test_sds.c | 41 ++++++++++ 3 files changed, 143 insertions(+), 82 deletions(-) diff --git a/src/sds.c b/src/sds.c 
index e14f4bd0bd..4dd7d709aa 100644 --- a/src/sds.c +++ b/src/sds.c @@ -1032,6 +1032,86 @@ int hex_digit_to_int(char c) { } } +/* Helper function for sdssplitargs that parses a single argument. It + * populates the number characters needed to store the parsed argument + * in len, if provided, or will copy the parsed string into dst, if provided. + * If the string is able to be parsed, this function returns the number of + * characters that were parsed. If the argument can't be parsed, it + * returns 0. */ +static int sdsparsearg(const char *arg, unsigned int *len, char *dst) { + const char *p = arg; + int inq = 0; /* set to 1 if we are in "quotes" */ + int insq = 0; /* set to 1 if we are in 'single quotes' */ + int done = 0; + + while (!done) { + int new_char = -1; + if (inq) { + if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) { + new_char = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3)); + p += 3; + } else if (*p == '\\' && *(p + 1)) { + p++; + switch (*p) { + case 'n': new_char = '\n'; break; + case 'r': new_char = '\r'; break; + case 't': new_char = '\t'; break; + case 'b': new_char = '\b'; break; + case 'a': new_char = '\a'; break; + default: new_char = *p; break; + } + } else if (*p == '"') { + /* closing quote must be followed by a space or + * nothing at all. */ + if (*(p + 1) && !isspace(*(p + 1))) return 0; + done = 1; + } else if (!*p) { + /* unterminated quotes */ + return 0; + } else { + new_char = *p; + } + } else if (insq) { + if (*p == '\\' && *(p + 1) == '\'') { + p++; + new_char = *p; + } else if (*p == '\'') { + /* closing quote must be followed by a space or + * nothing at all. 
*/ + if (*(p + 1) && !isspace(*(p + 1))) return 0; + done = 1; + } else if (!*p) { + /* unterminated quotes */ + return 0; + } else { + new_char = *p; + } + } else { + switch (*p) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\0': done = 1; break; + case '"': inq = 1; break; + case '\'': insq = 1; break; + default: new_char = *p; break; + } + } + if (new_char != -1) { + if (len) (*len)++; + if (dst) { + *dst = (char)new_char; + dst++; + } + } + if (*p) { + p++; + } + } + return p - arg; +} + /* Split a line into arguments, where every argument can be in the * following programming-language REPL-alike form: * @@ -1049,103 +1129,42 @@ int hex_digit_to_int(char c) { * The function returns the allocated tokens on success, even when the * input string is empty, or NULL if the input contains unbalanced * quotes or closed quotes followed by non space characters - * as in: "foo"bar or "foo' + * as in: "foo"bar or "foo'. + * + * The sds strings returned by this function are not initialized with + * extra space. 
*/ sds *sdssplitargs(const char *line, int *argc) { const char *p = line; - char *current = NULL; char **vector = NULL; *argc = 0; - while (1) { + while (*p) { /* skip blanks */ while (*p && isspace(*p)) p++; - if (*p) { - /* get a token */ - int inq = 0; /* set to 1 if we are in "quotes" */ - int insq = 0; /* set to 1 if we are in 'single quotes' */ - int done = 0; - - if (current == NULL) current = sdsempty(); - while (!done) { - if (inq) { - if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) { - unsigned char byte; - - byte = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3)); - current = sdscatlen(current, (char *)&byte, 1); - p += 3; - } else if (*p == '\\' && *(p + 1)) { - char c; - - p++; - switch (*p) { - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'b': c = '\b'; break; - case 'a': c = '\a'; break; - default: c = *p; break; - } - current = sdscatlen(current, &c, 1); - } else if (*p == '"') { - /* closing quote must be followed by a space or - * nothing at all. */ - if (*(p + 1) && !isspace(*(p + 1))) goto err; - done = 1; - } else if (!*p) { - /* unterminated quotes */ - goto err; - } else { - current = sdscatlen(current, p, 1); - } - } else if (insq) { - if (*p == '\\' && *(p + 1) == '\'') { - p++; - current = sdscatlen(current, "'", 1); - } else if (*p == '\'') { - /* closing quote must be followed by a space or - * nothing at all. 
*/ - if (*(p + 1) && !isspace(*(p + 1))) goto err; - done = 1; - } else if (!*p) { - /* unterminated quotes */ - goto err; - } else { - current = sdscatlen(current, p, 1); - } - } else { - switch (*p) { - case ' ': - case '\n': - case '\r': - case '\t': - case '\0': done = 1; break; - case '"': inq = 1; break; - case '\'': insq = 1; break; - default: current = sdscatlen(current, p, 1); break; - } - } - if (*p) p++; - } + if (!(*p)) break; + unsigned int len = 0; + if (sdsparsearg(p, &len, NULL)) { + sds current = sdsnewlen(SDS_NOINIT, len); + int parsedlen = sdsparsearg(p, NULL, current); + assert(parsedlen > 0); + p += parsedlen; + /* add the token to the vector */ vector = s_realloc(vector, ((*argc) + 1) * sizeof(char *)); vector[*argc] = current; (*argc)++; current = NULL; } else { - /* Even on empty input string return something not NULL. */ - if (vector == NULL) vector = s_malloc(sizeof(void *)); - return vector; + while ((*argc)--) sdsfree(vector[*argc]); + s_free(vector); + *argc = 0; + return NULL; } } - -err: - while ((*argc)--) sdsfree(vector[*argc]); - s_free(vector); - if (current) sdsfree(current); - *argc = 0; - return NULL; + /* Even on empty input string return something not NULL. 
*/ + if (vector == NULL) vector = s_malloc(sizeof(void *)); + return vector; } /* Modify the string substituting all the occurrences of the set of diff --git a/src/unit/test_files.h b/src/unit/test_files.h index cd2e0c5b92..c2b062039a 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -99,6 +99,7 @@ int test_raxFuzz(int argc, char **argv, int flags); int test_sds(int argc, char **argv, int flags); int test_typesAndAllocSize(int argc, char **argv, int flags); int test_sdsHeaderSizes(int argc, char **argv, int flags); +int test_sdssplitargs(int argc, char **argv, int flags); int test_sha1(int argc, char **argv, int flags); int test_string2ll(int argc, char **argv, int flags); int test_string2l(int argc, char **argv, int flags); @@ -157,7 +158,7 @@ unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEnco unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", 
test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", 
test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; -unitTest __test_sds_c[] = {{"test_sds", test_sds}, 
{"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}}; +unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}}; unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", 
test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c index 19b5c7d73f..b97d0d9d32 100644 --- a/src/unit/test_sds.c +++ b/src/unit/test_sds.c @@ -328,3 +328,44 @@ int test_sdsHeaderSizes(int argc, char **argv, int flags) { return 0; } + +int test_sdssplitargs(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int len; + sds *sargv; + + sargv = sdssplitargs("Testing 
one two three", &len); + TEST_ASSERT(4 == len); + TEST_ASSERT(!strcmp("Testing", sargv[0])); + TEST_ASSERT(!strcmp("one", sargv[1])); + TEST_ASSERT(!strcmp("two", sargv[2])); + TEST_ASSERT(!strcmp("three", sargv[3])); + sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("", &len); + TEST_ASSERT(0 == len); + TEST_ASSERT(sargv != NULL); + sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("\"Testing split strings\" \'Another split string\'", &len); + TEST_ASSERT(2 == len); + TEST_ASSERT(!strcmp("Testing split strings", sargv[0])); + TEST_ASSERT(!strcmp("Another split string", sargv[1])); + sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("\"Hello\" ", &len); + TEST_ASSERT(1 == len); + TEST_ASSERT(!strcmp("Hello", sargv[0])); + sdsfreesplitres(sargv, len); + + char *binary_string = "\"\\x73\\x75\\x70\\x65\\x72\\x20\\x00\\x73\\x65\\x63\\x72\\x65\\x74\\x20\\x70\\x61\\x73\\x73\\x77\\x6f\\x72\\x64\""; + sargv = sdssplitargs(binary_string, &len); + TEST_ASSERT(1 == len); + TEST_ASSERT(22 == sdslen(sargv[0])); + sdsfreesplitres(sargv, len); + + return 0; +} From e985ead7f9377e4d61d2e47c0b964f917d8054b4 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 1 Nov 2024 20:28:09 +0800 Subject: [PATCH 07/92] RDMA: Prevent IO for child process (#1244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RDMA MR (memory region) is not forkable, the VMA (virtual memory area) of a MR gets empty in a child process. Prevent IO for child process to avoid server crash. In the check for whether read and write is allowed in an RDMA connection, a check that if we're in a child process is added. If we are, the function returns an error, which will cause the RDMA client to be disconnected. 
Suggested-by: Viktor Söderqvist Signed-off-by: zhenwei pi --- src/rdma.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/rdma.c b/src/rdma.c index 9329fd3ab5..bb38baa0f1 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -143,12 +143,34 @@ static void serverRdmaError(char *err, const char *fmt, ...) { va_end(ap); } +static inline int connRdmaAllowCommand(void) { + /* RDMA MR is not accessible in a child process, avoid segment fault due to + * invalid MR access, close it rather than server random crash */ + if (server.in_fork_child != CHILD_TYPE_NONE) { + return C_ERR; + } + + return C_OK; +} + +static inline int connRdmaAllowRW(connection *conn) { + if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + return C_ERR; + } + + return connRdmaAllowCommand(); +} + static int rdmaPostRecv(RdmaContext *ctx, struct rdma_cm_id *cm_id, ValkeyRdmaCmd *cmd) { struct ibv_sge sge; size_t length = sizeof(ValkeyRdmaCmd); struct ibv_recv_wr recv_wr, *bad_wr; int ret; + if (connRdmaAllowCommand()) { + return C_ERR; + } + sge.addr = (uint64_t)cmd; sge.length = length; sge.lkey = ctx->cmd_mr->lkey; @@ -1214,6 +1236,10 @@ static size_t connRdmaSend(connection *conn, const void *data, size_t data_len) char *remote_addr = ctx->tx_addr + ctx->tx.offset; int ret; + if (connRdmaAllowCommand()) { + return C_ERR; + } + memcpy(addr, data, data_len); sge.addr = (uint64_t)addr; @@ -1247,7 +1273,7 @@ static int connRdmaWrite(connection *conn, const void *data, size_t data_len) { RdmaContext *ctx = cm_id->context; uint32_t towrite; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1290,7 +1316,7 @@ static int connRdmaRead(connection *conn, void *buf, size_t buf_len) { struct rdma_cm_id *cm_id = rdma_conn->cm_id; RdmaContext *ctx = cm_id->context; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if 
(connRdmaAllowRW(conn)) { return C_ERR; } @@ -1312,7 +1338,7 @@ static ssize_t connRdmaSyncWrite(connection *conn, char *ptr, ssize_t size, long long long start = mstime(); uint32_t towrite; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1355,7 +1381,7 @@ static ssize_t connRdmaSyncRead(connection *conn, char *ptr, ssize_t size, long long long start = mstime(); uint32_t toread; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1390,7 +1416,7 @@ static ssize_t connRdmaSyncReadLine(connection *conn, char *ptr, ssize_t size, l char *c; char nl = 0; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } From 0d7b2344b2c0df0269a8e018efc87438a8ec510e Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 1 Nov 2024 15:16:18 -0700 Subject: [PATCH 08/92] correct type internal to kvstore (minor) (#1246) All of the internal variables related to number of dicts in the kvstore are type `int`. Not sure why these 2 items were declared as `long long`. Signed-off-by: Jim Brunner --- src/kvstore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kvstore.c b/src/kvstore.c index e92af03784..7142fa0f61 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -54,8 +54,8 @@ struct _kvstore { int flags; dictType *dtype; dict **dicts; - long long num_dicts; - long long num_dicts_bits; + int num_dicts; + int num_dicts_bits; list *rehashing; /* List of dictionaries in this kvstore that are currently rehashing. */ int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used if num_dicts > 1). */ int allocated_dicts; /* The number of allocated dicts. 
*/ From a102852d5ed5316063d680362f910e725070b9ee Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 2 Nov 2024 19:51:14 +0800 Subject: [PATCH 09/92] Fix timing issue in cluster-shards tests (#1243) The cluster-node-timeout is 3000 in our tests, the timing test wasn't succeeding, so extending the wait_for made them much more reliable. Signed-off-by: Binbin --- tests/cluster/tests/28-cluster-shards.tcl | 2 +- tests/unit/cluster/cluster-shards.tcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cluster/tests/28-cluster-shards.tcl b/tests/cluster/tests/28-cluster-shards.tcl index d6534c816b..5fb6743246 100644 --- a/tests/cluster/tests/28-cluster-shards.tcl +++ b/tests/cluster/tests/28-cluster-shards.tcl @@ -117,7 +117,7 @@ test "Kill a node and tell the replica to immediately takeover" { # Primary 0 node should report as fail, wait until the new primary acknowledges it. test "Verify health as fail for killed node" { - wait_for_condition 50 100 { + wait_for_condition 1000 50 { "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"] } else { fail "New primary never detected the node failed" diff --git a/tests/unit/cluster/cluster-shards.tcl b/tests/unit/cluster/cluster-shards.tcl index 19acd186f5..170114d822 100644 --- a/tests/unit/cluster/cluster-shards.tcl +++ b/tests/unit/cluster/cluster-shards.tcl @@ -42,7 +42,7 @@ start_cluster 3 3 {tags {external:skip cluster}} { } test "Verify health as fail for killed node" { - wait_for_condition 50 100 { + wait_for_condition 1000 50 { "fail" eq [dict get [get_node_info_from_shard $node_0_id $validation_node "node"] "health"] } else { fail "New primary never detected the node failed" From 3c32ee1bdaddcd5fbe699aa6c8b320e86702d1b6 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 4 Nov 2024 12:36:20 -0800 Subject: [PATCH 10/92] Add a filter option to drop all cluster packets (#1252) A minor debugging change that helped in the investigation of 
https://github.com/valkey-io/valkey/issues/1251. Basically there are some edge cases where we want to fully isolate a node from receiving packets, but can't suspend the process because we need it to continue sending outbound traffic. So, added a filter for that. Signed-off-by: Madelyn Olson --- src/cluster_legacy.c | 5 +++-- src/debug.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 43d56b9a09..f1c9eb1fcf 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2981,7 +2981,7 @@ int clusterIsValidPacket(clusterLink *link) { return 0; } - if (type == server.cluster_drop_packet_filter) { + if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) { serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); return 0; } @@ -3070,7 +3070,8 @@ int clusterProcessPacket(clusterLink *link) { if (!clusterIsValidPacket(link)) { clusterMsg *hdr = (clusterMsg *)link->rcvbuf; uint16_t type = ntohs(hdr->type); - if (server.debug_cluster_close_link_on_packet_drop && type == server.cluster_drop_packet_filter) { + if (server.debug_cluster_close_link_on_packet_drop && + (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) { freeClusterLink(link); serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); return 0; diff --git a/src/debug.c b/src/debug.c index d221a884ee..13da7bcc93 100644 --- a/src/debug.c +++ b/src/debug.c @@ -432,7 +432,7 @@ void debugCommand(client *c) { " Some fields of the default behavior may be time consuming to fetch,", " and `fast` can be passed to avoid fetching them.", "DROP-CLUSTER-PACKET-FILTER ", " Drop all packets that match the filtered type. 
Set to -1 allow all packets or -2 to drop all packets.", "CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", " This is valid only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", " When set to 1, the cluster link is closed after dropping a packet based on the filter.", From 48ebe21ad1a30eee60c22fe8235118f4c6b1aed3 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:57:34 +0200 Subject: [PATCH 11/92] fix: clean up refactoring leftovers (#1264) This commit addresses issues that were likely introduced during a rebase related to: https://github.com/valkey-io/valkey/commit/b0f23df16522e91a769c75646166045ae70e8d4e Change dual channel replication state in main handler only Signed-off-by: naglera --- src/replication.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/replication.c b/src/replication.c index 8ff8ad3f0f..6e8faff7a2 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3247,7 +3247,6 @@ int dualChannelReplMainConnSendHandshake(connection *conn, sds *err) { ull2string(llstr, sizeof(llstr), server.rdb_client_id); *err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); if (*err) return C_ERR; - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; return C_OK; } @@ -3258,7 +3257,6 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); return C_ERR; } - server.repl_state = REPL_STATE_SEND_PSYNC; return C_OK; } @@ -3269,7 +3267,6 @@ int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { *err = sdsnew(connGetLastError(conn)); return C_ERR; } - server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; return C_OK; } From 12c5af03b8b2d868fd35f4c1142162695f8dd41c Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 6 Nov 2024 10:32:00 +0800 Subject: [PATCH 12/92] Remove empty DB check branch in KEYS command (#1259) We don't think we really care about optimizing for the empty DB 
case, which should be uncommon. Adding branches hurts branch prediction. Signed-off-by: Binbin --- src/db.c | 5 ----- tests/unit/keyspace.tcl | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/db.c b/src/db.c index ceb3105f9b..3e0e5a2e63 100644 --- a/src/db.c +++ b/src/db.c @@ -830,11 +830,6 @@ void keysCommand(client *c) { kvstoreDictIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - if (!kvstoreDictSize(c->db->keys, pslot)) { - /* Requested slot is empty */ - setDeferredArrayLen(c, replylen, 0); - return; - } kvs_di = kvstoreGetDictSafeIterator(c->db->keys, pslot); } else { kvs_it = kvstoreIteratorInit(c->db->keys); diff --git a/tests/unit/keyspace.tcl b/tests/unit/keyspace.tcl index ba55c1b8ea..1936f5e217 100644 --- a/tests/unit/keyspace.tcl +++ b/tests/unit/keyspace.tcl @@ -47,6 +47,10 @@ start_server {tags {"keyspace"}} { r dbsize } {0} + test {KEYS with empty DB} { + assert_equal {} [r keys *] + } + test "DEL against expired key" { r debug set-active-expire 0 r setex keyExpire 1 valExpire @@ -554,3 +558,14 @@ foreach {type large} [array get largevalue] { r KEYS [string repeat "*?" 50000] } {} } + +start_cluster 1 0 {tags {"keyspace external:skip cluster"}} { + test {KEYS with empty DB in cluster mode} { + assert_equal {} [r keys *] + assert_equal {} [r keys foo*] + } + + test {KEYS with empty slot in cluster mode} { + assert_equal {} [r keys foo] + } +} From a0b1cbad83012b93f1e04f77cb3a067a9f37dd97 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 7 Nov 2024 12:13:00 +0800 Subject: [PATCH 13/92] Change errno from EEXIST to EALREADY in serverFork if child process exists (#1258) We set this to EEXIST in 568c2e039bac388003068cd8debb2f93619dd462, it prints "File exists" which is not quite accurate, change it to EALREADY, it will print "Operation already in progress". 
Signed-off-by: Binbin --- src/server.c | 2 +- tests/unit/moduleapi/fork.tcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server.c b/src/server.c index 508edc7112..5658b05115 100644 --- a/src/server.c +++ b/src/server.c @@ -6396,7 +6396,7 @@ void closeChildUnusedResourceAfterFork(void) { int serverFork(int purpose) { if (isMutuallyExclusiveChildType(purpose)) { if (hasActiveChildProcess()) { - errno = EEXIST; + errno = EALREADY; return -1; } diff --git a/tests/unit/moduleapi/fork.tcl b/tests/unit/moduleapi/fork.tcl index 9d1f9c184c..bf53bd2db8 100644 --- a/tests/unit/moduleapi/fork.tcl +++ b/tests/unit/moduleapi/fork.tcl @@ -26,7 +26,7 @@ start_server {tags {"modules"}} { # module fork twice assert_error {Fork failed} {r fork.create 0 1} - assert {[count_log_message 0 "Can't fork for module: File exists"] eq "1"} + assert {[count_log_message 0 "Can't fork for module: Operation already in progress"] eq "1"} r fork.kill From 22bc49c4a62894694f7977bb9047e1da27599c25 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 7 Nov 2024 13:42:20 +0800 Subject: [PATCH 14/92] Try to stabilize the failover call in the slot migration test (#1078) The CI report replica will return the error when performing CLUSTER FAILOVER: ``` -ERR Master is down or failed, please use CLUSTER FAILOVER FORCE ``` This may be because the primary state is fail or the cluster connection is disconnected during the primary pause. In this PR, we added some waits in wait_for_role, if the role is replica, we will wait for the replication link and the cluster link to be ok. 
Signed-off-by: Binbin --- tests/support/cluster_util.tcl | 8 +++++ tests/unit/cluster/slot-migration.tcl | 44 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index dd5cd84df2..4b399214b9 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -277,6 +277,14 @@ proc cluster_get_myself id { return {} } +# Returns the parsed "myself's primary" CLUSTER NODES entry as a dictionary. +proc cluster_get_myself_primary id { + set myself [cluster_get_myself $id] + set replicaof [dict get $myself slaveof] + set node [cluster_get_node_by_id $id $replicaof] + return $node +} + # Get a specific node by ID by parsing the CLUSTER NODES output # of the instance Number 'instance_id' proc cluster_get_node_by_id {instance_id node_id} { diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl index d798971968..289c20578d 100644 --- a/tests/unit/cluster/slot-migration.tcl +++ b/tests/unit/cluster/slot-migration.tcl @@ -14,17 +14,61 @@ proc get_cluster_role {srv_idx} { return $role } +proc get_myself_primary_flags {srv_idx} { + set flags [dict get [cluster_get_myself_primary $srv_idx] flags] + return $flags +} + +proc get_myself_primary_linkstate {srv_idx} { + set linkstate [dict get [cluster_get_myself_primary $srv_idx] linkstate] + return $linkstate +} + proc wait_for_role {srv_idx role} { + # Wait for the role, make sure the replication role matches. wait_for_condition 100 100 { [lindex [split [R $srv_idx ROLE] " "] 0] eq $role } else { + puts "R $srv_idx ROLE: [R $srv_idx ROLE]" fail "R $srv_idx didn't assume the replication $role in time" } + + if {$role eq "slave"} { + # Wait for the replication link, make sure the replication link is normal. 
+ wait_for_condition 100 100 { + [s -$srv_idx master_link_status] eq "up" + } else { + puts "R $srv_idx INFO REPLICATION: [R $srv_idx INFO REPLICATION]" + fail "R $srv_idx didn't assume the replication link in time" + } + } + + # Wait for the cluster role, make sure the cluster role matches. wait_for_condition 100 100 { [get_cluster_role $srv_idx] eq $role } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" fail "R $srv_idx didn't assume the cluster $role in time" } + + if {$role eq "slave"} { + # Wait for the flags, make sure the primary node is not failed. + wait_for_condition 100 100 { + [get_myself_primary_flags $srv_idx] eq "master" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the primary state in time" + } + + # Wait for the cluster link, make sure that the cluster connection is normal. + wait_for_condition 100 100 { + [get_myself_primary_linkstate $srv_idx] eq "connected" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the cluster primary link in time" + } + } + wait_for_cluster_propagation } From 1c18c8084451153c468e3224f31da43ff6fbd615 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 7 Nov 2024 13:44:21 +0800 Subject: [PATCH 15/92] Fix incorrect cache_memory reset in functionsLibCtxClear (#1255) functionsLibCtxClear should clear the provided lib_ctx parameter, not the static variable curr_functions_lib_ctx, as this contradicts the function's intended purpose. The impact i guess is minor, like in some unhappy paths (diskless load fails, function restore fails?), we will mess up the functions_caches field, which is used in used_memory_functions / used_memory_scripts fileds in INFO. 
Signed-off-by: Binbin --- src/functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functions.c b/src/functions.c index a00fefb329..e950024bad 100644 --- a/src/functions.c +++ b/src/functions.c @@ -175,7 +175,7 @@ void functionsLibCtxClear(functionsLibCtx *lib_ctx) { stats->n_lib = 0; } dictReleaseIterator(iter); - curr_functions_lib_ctx->cache_memory = 0; + lib_ctx->cache_memory = 0; } void functionsLibCtxClearCurrent(int async) { From 3672f9b2c322c4c8f073acc5973fffce546bd4e5 Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 7 Nov 2024 20:05:16 -0500 Subject: [PATCH 16/92] Revert "Decline unsubscribe related command in non-subscribed mode" (#1265) This PR goal is to revert the changes on PR https://github.com/valkey-io/valkey/pull/759 Recently, we got some reports that in Valkey 8.0 the PR https://github.com/valkey-io/valkey/pull/759 (Decline unsubscribe related command in non-subscribed mode) causes break change. (https://github.com/valkey-io/valkey/issues/1228) Although from my thought, call commands "unsubscribeCommand", "sunsubscribeCommand", "punsubscribeCommand" in request-response mode make no sense. This is why I created PR https://github.com/valkey-io/valkey/pull/759 But breaking change is always no good, @valkey-io/core-team How do you think we revert this PR code changes? 
Signed-off-by: hwware --- src/server.c | 6 ------ tests/unit/info.tcl | 3 +-- tests/unit/pubsub.tcl | 25 ++++++++++++++++++++----- tests/unit/pubsubshard.tcl | 5 +++-- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/server.c b/src/server.c index 5658b05115..eda9a5b582 100644 --- a/src/server.c +++ b/src/server.c @@ -4165,12 +4165,6 @@ int processCommand(client *c) { return C_OK; } - /* Not allow several UNSUBSCRIBE commands executed under non-pubsub mode */ - if (!c->flag.pubsub && (c->cmd->proc == unsubscribeCommand || c->cmd->proc == sunsubscribeCommand || - c->cmd->proc == punsubscribeCommand)) { - rejectCommandFormat(c, "-NOSUB '%s' command executed not in subscribed mode", c->cmd->fullname); - return C_OK; - } /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 61d1acd1f8..278a1d8e33 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -424,8 +424,7 @@ start_server {tags {"info" "external:skip"}} { set info [r info clients] assert_equal [getInfoProperty $info pubsub_clients] {1} # non-pubsub clients should not be involved - catch {unsubscribe $rd2 {non-exist-chan}} e - assert_match {*NOSUB*} $e + assert_equal {0} [unsubscribe $rd2 {non-exist-chan}] set info [r info clients] assert_equal [getInfoProperty $info pubsub_clients] {1} # close all clients diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 68dc79a4a4..24b78b6e5a 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -109,12 +109,9 @@ start_server {tags {"pubsub network"}} { $rd1 close } - test "UNSUBSCRIBE and PUNSUBSCRIBE from non-subscribed channels" { + test "UNSUBSCRIBE from non-subscribed channels" { set rd1 [valkey_deferring_client] - foreach command {unsubscribe punsubscribe} { - catch {$command $rd1 {foo bar quux}} e - assert_match {*NOSUB*} $e - } + assert_equal 
{0 0 0} [unsubscribe $rd1 {foo bar quux}] # clean up clients $rd1 close } @@ -204,6 +201,14 @@ start_server {tags {"pubsub network"}} { $rd close } {0} {resp3} + test "PUNSUBSCRIBE from non-subscribed channels" { + set rd1 [valkey_deferring_client] + assert_equal {0 0 0} [punsubscribe $rd1 {foo.* bar.* quux.*}] + + # clean up clients + $rd1 close + } + test "NUMSUB returns numbers, not strings (#1561)" { r pubsub numsub abc def } {abc 0 def 0} @@ -241,6 +246,16 @@ start_server {tags {"pubsub network"}} { $rd1 close } + test "PUNSUBSCRIBE and UNSUBSCRIBE should always reply" { + # Make sure we are not subscribed to any channel at all. + r punsubscribe + r unsubscribe + # Now check if the commands still reply correctly. + set reply1 [r punsubscribe] + set reply2 [r unsubscribe] + concat $reply1 $reply2 + } {punsubscribe {} 0 unsubscribe {} 0} + ### Keyspace events notification tests test "Keyspace notifications: we receive keyspace notifications" { diff --git a/tests/unit/pubsubshard.tcl b/tests/unit/pubsubshard.tcl index d62a415705..e0e1e2972b 100644 --- a/tests/unit/pubsubshard.tcl +++ b/tests/unit/pubsubshard.tcl @@ -74,8 +74,9 @@ start_server {tags {"pubsubshard external:skip"}} { test "SUNSUBSCRIBE from non-subscribed channels" { set rd1 [valkey_deferring_client] - catch {sunsubscribe $rd1 {foo}} e - assert_match {*NOSUB*} $e + assert_equal {0} [sunsubscribe $rd1 {foo}] + assert_equal {0} [sunsubscribe $rd1 {bar}] + assert_equal {0} [sunsubscribe $rd1 {quux}] # clean up clients $rd1 close From 07b3e7ae7a9e08101fa4dd50aebb8fa5fbdd4f1e Mon Sep 17 00:00:00 2001 From: eifrah-aws Date: Fri, 8 Nov 2024 04:01:37 +0200 Subject: [PATCH 17/92] Add CMake build system for valkey (#1196) With this commit, users are able to build valkey using `CMake`. ## Example usage: Build `valkey-server` in Release mode with TLS enabled and using `jemalloc` as the allocator: ```bash mkdir build-release cd $_ cmake .. 
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=/tmp/valkey-install \ -DBUILD_MALLOC=jemalloc -DBUILD_TLS=1 make -j$(nproc) install # start valkey /tmp/valkey-install/bin/valkey-server ``` Build `valkey-unit-tests`: ```bash mkdir build-release-ut cd $_ cmake .. -DCMAKE_BUILD_TYPE=Release \ -DBUILD_MALLOC=jemalloc -DBUILD_UNIT_TESTS=1 make -j$(nproc) # Run the tests ./bin/valkey-unit-tests ``` Current features supported by this PR: - Building against different allocators: (`jemalloc`, `tcmalloc`, `tcmalloc_minimal` and `libc`), e.g. to enable `jemalloc` pass `-DBUILD_MALLOC=jemalloc` to `cmake` - OpenSSL builds (to enable TLS, pass `-DBUILD_TLS=1` to `cmake`) - Sanitizier: pass `-DBUILD_SANITIZER=` to `cmake` - Install target + redis symbolic links - Build `valkey-unit-tests` executable - Standard CMake variables are supported. e.g. to install `valkey` under `/home/you/root` pass `-DCMAKE_INSTALL_PREFIX=/home/you/root` Why using `CMake`? To list *some* of the advantages of using `CMake`: - Superior IDE integrations: cmake generates the file `compile_commands.json` which is required by `clangd` to get a compiler accuracy code completion (in other words: your VScode will thank you) - Out of the source build tree: with the current build system, object files are created all over the place polluting the build source tree, the best practice is to build the project on a separate folder - Multiple build types co-existing: with the current build system, it is often hard to have multiple build configurations. With cmake you can do it easily: - It is the de-facto standard for C/C++ project these days More build examples: ASAN build: ```bash mkdir build-asan cd $_ cmake .. -DBUILD_SANITIZER=address -DBUILD_MALLOC=libc make -j$(nproc) ``` ASAN with jemalloc: ```bash mkdir build-asan-jemalloc cd $_ cmake .. -DBUILD_SANITIZER=address -DBUILD_MALLOC=jemalloc make -j$(nproc) ``` As seen by the previous examples, any combination is allowed and co-exist on the same source tree. 
## Valkey installation With this new `CMake`, it is possible to install the binary by running `make install` or creating a package `make package` (currently supported on Debian like distros) ### Example 1: build & install using `make install`: ```bash mkdir build-release cd $_ cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/valkey-install -DCMAKE_BUILD_TYPE=Release make -j$(nproc) install # valkey is now installed under $HOME/valkey-install ``` ### Example 2: create a `.deb` installer: ```bash mkdir build-release cd $_ cmake .. -DCMAKE_BUILD_TYPE=Release make -j$(nproc) package # ... CPack deb generation output sudo gdebi -n ./valkey_8.1.0_amd64.deb # valkey is now installed under /opt/valkey ``` ### Example 3: create installer for non Debian systems (e.g. FreeBSD or macOS): ```bash mkdir build-release cd $_ cmake .. -DCMAKE_BUILD_TYPE=Release make -j$(nproc) package mkdir -p /opt/valkey && ./valkey-8.1.0-Darwin.sh --prefix=/opt/valkey --exclude-subdir # valkey-server is now installed under /opt/valkey ``` Signed-off-by: Eran Ifrah --- .cmake-format.yaml | 76 ++++++ .github/workflows/ci.yml | 25 ++ .gitignore | 2 + CMakeLists.txt | 43 ++++ README.md | 124 +++++++--- cmake/Modules/Packaging.cmake | 44 ++++ cmake/Modules/SourceFiles.cmake | 153 ++++++++++++ cmake/Modules/Utils.cmake | 102 ++++++++ cmake/Modules/ValkeySetup.cmake | 381 ++++++++++++++++++++++++++++++ deps/CMakeLists.txt | 26 ++ deps/fpconv/CMakeLists.txt | 4 + deps/hdr_histogram/CMakeLists.txt | 7 + deps/jemalloc/CMakeLists.txt | 23 ++ deps/linenoise/CMakeLists.txt | 4 + deps/lua/CMakeLists.txt | 44 ++++ src/CMakeLists.txt | 77 ++++++ src/modules/CMakeLists.txt | 21 ++ src/server.c | 1 - src/unit/CMakeLists.txt | 58 +++++ tests/CMakeLists.txt | 5 + tests/modules/CMakeLists.txt | 58 +++++ tests/rdma/CMakeLists.txt | 9 + 22 files changed, 1252 insertions(+), 35 deletions(-) create mode 100644 .cmake-format.yaml create mode 100644 CMakeLists.txt create mode 100644 cmake/Modules/Packaging.cmake create mode 100644 
cmake/Modules/SourceFiles.cmake create mode 100644 cmake/Modules/Utils.cmake create mode 100644 cmake/Modules/ValkeySetup.cmake create mode 100644 deps/CMakeLists.txt create mode 100644 deps/fpconv/CMakeLists.txt create mode 100644 deps/hdr_histogram/CMakeLists.txt create mode 100644 deps/jemalloc/CMakeLists.txt create mode 100644 deps/linenoise/CMakeLists.txt create mode 100644 deps/lua/CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/modules/CMakeLists.txt create mode 100644 src/unit/CMakeLists.txt create mode 100644 tests/CMakeLists.txt create mode 100644 tests/modules/CMakeLists.txt create mode 100644 tests/rdma/CMakeLists.txt diff --git a/.cmake-format.yaml b/.cmake-format.yaml new file mode 100644 index 0000000000..98ab11753a --- /dev/null +++ b/.cmake-format.yaml @@ -0,0 +1,76 @@ +format: + _help_line_width: + - How wide to allow formatted cmake files + line_width: 120 + _help_tab_size: + - How many spaces to tab for indent + tab_size: 4 + _help_use_tabchars: + - If true, lines are indented using tab characters (utf-8 + - 0x09) instead of space characters (utf-8 0x20). + - In cases where the layout would require a fractional tab + - character, the behavior of the fractional indentation is + - governed by + use_tabchars: false + _help_separate_ctrl_name_with_space: + - If true, separate flow control names from their parentheses + - with a space + separate_ctrl_name_with_space: true + _help_min_prefix_chars: + - If the statement spelling length (including space and + - parenthesis) is smaller than this amount, then force reject + - nested layouts. + min_prefix_chars: 4 + _help_max_prefix_chars: + - If the statement spelling length (including space and + - parenthesis) is larger than the tab width by more than this + - amount, then force reject un-nested layouts. + max_prefix_chars: 10 + _help_max_lines_hwrap: + - If a candidate layout is wrapped horizontally but it exceeds + - this many lines, then reject the layout. 
+ max_lines_hwrap: 2 + _help_line_ending: + - What style line endings to use in the output. + line_ending: unix + _help_command_case: + - Format command names consistently as 'lower' or 'upper' case + command_case: lower + _help_keyword_case: + - Format keywords consistently as 'lower' or 'upper' case + keyword_case: unchanged + _help_always_wrap: + - A list of command names which should always be wrapped + always_wrap: [] + _help_enable_sort: + - If true, the argument lists which are known to be sortable + - will be sorted lexicographicall + enable_sort: true + _help_autosort: + - If true, the parsers may infer whether or not an argument + - list is sortable (without annotation). + autosort: false + _help_require_valid_layout: + - By default, if cmake-format cannot successfully fit + - everything into the desired linewidth it will apply the + - last, most agressive attempt that it made. If this flag is + - True, however, cmake-format will print error, exit with non- + - zero status code, and write-out nothing + require_valid_layout: false + _help_layout_passes: + - A dictionary mapping layout nodes to a list of wrap + - decisions. See the documentation for more information. + layout_passes: {} +encode: + _help_emit_byteorder_mark: + - If true, emit the unicode byte-order mark (BOM) at the start + - of the file + emit_byteorder_mark: false + _help_input_encoding: + - Specify the encoding of the input file. Defaults to utf-8 + input_encoding: utf-8 + _help_output_encoding: + - Specify the encoding of the output file. Defaults to utf-8. 
+ - Note that cmake only claims to support utf-8 so be careful + - when using anything else + output_encoding: utf-8 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 48a94ef984..bc946b7193 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,31 @@ jobs: run: | ./src/valkey-unit-tests + test-ubuntu-latest-cmake: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: cmake and make + run: | + sudo apt-get install -y cmake libssl-dev + mkdir -p build-release + cd build-release + cmake -DCMAKE_BUILD_TYPE=Release .. -DBUILD_TLS=yes -DBUILD_UNIT_TESTS=yes + make -j$(nproc) + - name: test + run: | + sudo apt-get install -y tcl8.6 tclx + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-server + ln -sf $(pwd)/build-release/bin/valkey-cli $(pwd)/src/valkey-cli + ln -sf $(pwd)/build-release/bin/valkey-benchmark $(pwd)/src/valkey-benchmark + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-check-aof + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-check-rdb + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-sentinel + ./runtest --verbose --tags -slow --dump-logs + - name: unit tests + run: | + ./build-release/bin/valkey-unit-tests + test-sanitizer-address: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index e448e23f7e..b108b4bb92 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ nodes*.conf tests/cluster/tmp/* tests/rdma/rdma-test tags +build-debug/ +build-release/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..ad0bab8896 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 3.20) + +# Must be done first +if (APPLE) + # Force clang compiler on macOS + find_program(CLANGPP "clang++") + find_program(CLANG "clang") + if (CLANG AND CLANGPP) + message(STATUS "Found ${CLANGPP}, ${CLANG}") + 
set(CMAKE_CXX_COMPILER ${CLANGPP}) + set(CMAKE_C_COMPILER ${CLANG}) + endif () +endif () + +# Options +option(BUILD_UNIT_TESTS "Build valkey-unit-tests" OFF) +option(BUILD_TEST_MODULES "Build all test modules" OFF) +option(BUILD_EXAMPLE_MODULES "Build example modules" OFF) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +project("valkey") + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED ON) +set(CMAKE_C_EXTENSIONS ON) + +include(ValkeySetup) +add_subdirectory(src) +add_subdirectory(tests) + +# Include the packaging module +include(Packaging) + +# Clear cached variables from the cache +unset(BUILD_TESTS CACHE) +unset(CLANGPP CACHE) +unset(CLANG CACHE) +unset(BUILD_RDMA_MODULE CACHE) +unset(BUILD_TLS_MODULE CACHE) +unset(BUILD_UNIT_TESTS CACHE) +unset(BUILD_TEST_MODULES CACHE) +unset(BUILD_EXAMPLE_MODULES CACHE) +unset(USE_TLS CACHE) diff --git a/README.md b/README.md index 1a8ce1a4db..94f38bccf7 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,12 @@ This project was forked from the open source Redis project right before the tran This README is just a fast *quick start* document. More details can be found under [valkey.io](https://valkey.io/) -What is Valkey? --------------- +# What is Valkey? + Valkey is a high-performance data structure server that primarily serves key/value workloads. It supports a wide range of native structures and an extensible plugin system for adding new data structures and access patterns. -Building Valkey --------------- +# Building Valkey using `Makefile` Valkey can be compiled and used on Linux, OSX, OpenBSD, NetBSD, FreeBSD. We support big endian and little endian architectures, and both 32 bit @@ -43,7 +42,7 @@ supports RDMA as connection module mode. 
Run: % make BUILD_RDMA=module -To build with systemd support, you'll need systemd development libraries (such +To build with systemd support, you'll need systemd development libraries (such as libsystemd-dev on Debian/Ubuntu or systemd-devel on CentOS) and run: % make USE_SYSTEMD=yes @@ -71,8 +70,7 @@ More about running the integration tests can be found in [tests/README.md](tests/README.md) and for unit tests, see [src/unit/README.md](src/unit/README.md). -Fixing build problems with dependencies or cached build options ---------- +## Fixing build problems with dependencies or cached build options Valkey has some dependencies which are included in the `deps` directory. `make` does not automatically rebuild dependencies even if something in @@ -91,8 +89,7 @@ optimizations (for debugging purposes), and other similar build time options, those options are cached indefinitely until you issue a `make distclean` command. -Fixing problems building 32 bit binaries ---------- +## Fixing problems building 32 bit binaries If after building Valkey with a 32 bit target you need to rebuild it with a 64 bit target, or the other way around, you need to perform a @@ -105,8 +102,7 @@ the following steps: * Try using the following command line instead of `make 32bit`: `make CFLAGS="-m32 -march=native" LDFLAGS="-m32"` -Allocator ---------- +## Allocator Selecting a non-default memory allocator when building Valkey is done by setting the `MALLOC` environment variable. Valkey is compiled and linked against libc @@ -122,28 +118,25 @@ To compile against jemalloc on Mac OS X systems, use: % make MALLOC=jemalloc -Monotonic clock ---------------- +## Monotonic clock By default, Valkey will build using the POSIX clock_gettime function as the monotonic clock source. On most modern systems, the internal processor clock -can be used to improve performance. Cautions can be found here: +can be used to improve performance. 
Cautions can be found here: http://oliveryang.net/2015/09/pitfalls-of-TSC-usage/ To build with support for the processor's internal instruction clock, use: % make CFLAGS="-DUSE_PROCESSOR_CLOCK" -Verbose build -------------- +## Verbose build Valkey will build with a user-friendly colorized output by default. If you want to see a more verbose output, use the following: % make V=1 -Running Valkey -------------- +# Running Valkey To run Valkey with the default configuration, just type: @@ -165,10 +158,10 @@ as options using the command line. Examples: All the options in valkey.conf are also supported as options using the command line, with exactly the same name. -Running Valkey with TLS: ------------------- +# Running Valkey with TLS: + +## Running manually -### Running manually To manually run a Valkey server with TLS mode (assuming `./gen-test-certs.sh` was invoked so sample certificates/keys are available): * TLS built-in mode: @@ -204,8 +197,7 @@ Specifying `--tls-replication yes` makes a replica connect to the primary. Using `--tls-cluster yes` makes Valkey Cluster use TLS across nodes. -Running Valkey with RDMA: ------------------- +# Running Valkey with RDMA: Note that Valkey Over RDMA is an experimental feature. It may be changed or removed in any minor or major version. @@ -236,8 +228,7 @@ Or: % ibv_devices -Playing with Valkey ------------------- +# Playing with Valkey You can use valkey-cli to play with Valkey. Start a valkey-server instance, then in another terminal try the following: @@ -256,8 +247,7 @@ then in another terminal try the following: (integer) 2 valkey> -Installing Valkey ------------------ +# Installing Valkey In order to install Valkey binaries into /usr/local/bin, just use: @@ -289,16 +279,82 @@ system reboots. You'll be able to stop and start Valkey using the script named `/etc/init.d/valkey_`, for instance `/etc/init.d/valkey_6379`. 
-Code contributions ------------------ +# Building using `CMake` + +In addition to the traditional `Makefile` build, Valkey supports an alternative, **experimental**, build system using `CMake`. + +To build and install `Valkey`, in `Release` mode (an optimized build), type this into your terminal: + +```bash +mkdir build-release +cd $_ +cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/valkey +sudo make install +# Valkey is now installed under /opt/valkey +``` + +Other options supported by Valkey's `CMake` build system: + +## Special build flags + +- `-DBUILD_TLS=` enable TLS build for Valkey +- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported) +- `-DBUILD_MALLOC=` choose the allocator to use. Default on Linux: `jemalloc`, for other OS: `libc` +- `-DBUILD_SANITIZER=` build with address sanitizer enabled +- `-DBUILD_UNIT_TESTS=[1|0]` when set, the build will produce the executable `valkey-unit-tests` +- `-DBUILD_TEST_MODULES=[1|0]` when set, the build will include the modules located under the `tests/modules` folder +- `-DBUILD_EXAMPLE_MODULES=[1|0]` when set, the build will include the example modules located under the `src/modules` folder + +## Common flags + +- `-DCMAKE_BUILD_TYPE=` define the build type, see CMake manual for more details +- `-DCMAKE_INSTALL_PREFIX=/installation/path` override this value to define a custom install prefix. Default: `/usr/local` +- `-G` generate build files for "Generator Name". By default, CMake will generate `Makefile`s. + +## Verbose build + +`CMake` generates a user-friendly colorized output by default. +If you want to see a more verbose output, use the following: + +```bash +make VERBOSE=1 +``` + +## Troubleshooting + +During the `CMake` stage, `CMake` caches variables in a local file named `CMakeCache.txt`. All variables generated by Valkey +are removed from the cache once consumed (this is done by calling to `unset(VAR-NAME CACHE)`). 
However, some variables, +like the compiler path, are kept in cache. To start a fresh build either remove the cache file `CMakeCache.txt` from the +build folder, or delete the build folder completely. + +**It is important to re-run `CMake` when adding new source files.** + +## Integration with IDE + +During the `CMake` stage of the build, `CMake` generates a JSON file named `compile_commands.json` and places it under the +build folder. This file is used by many IDEs and text editors for providing code completion (via `clangd`). + +A small caveat is that these tools will look for `compile_commands.json` under the Valkey's top folder. +A common workaround is to create a symbolic link to it: + +```bash +cd /path/to/valkey/ +# We assume here that your build folder is `build-release` +ln -sf $(pwd)/build-release/compile_commands.json $(pwd)/compile_commands.json +``` + +Restart your IDE and voila + +# Code contributions + Please see the [CONTRIBUTING.md][2]. For security bugs and vulnerabilities, please see [SECURITY.md][3]. 
-[1]: https://github.com/valkey-io/valkey/blob/unstable/COPYING -[2]: https://github.com/valkey-io/valkey/blob/unstable/CONTRIBUTING.md -[3]: https://github.com/valkey-io/valkey/blob/unstable/SECURITY.md +# Valkey is an open community project under LF Projects -Valkey is an open community project under LF Projects ------------------ Valkey a Series of LF Projects, LLC 2810 N Church St, PMB 57274 Wilmington, Delaware 19802-4447 + +[1]: https://github.com/valkey-io/valkey/blob/unstable/COPYING +[2]: https://github.com/valkey-io/valkey/blob/unstable/CONTRIBUTING.md +[3]: https://github.com/valkey-io/valkey/blob/unstable/SECURITY.md diff --git a/cmake/Modules/Packaging.cmake b/cmake/Modules/Packaging.cmake new file mode 100644 index 0000000000..c7ed5c426b --- /dev/null +++ b/cmake/Modules/Packaging.cmake @@ -0,0 +1,44 @@ +set(CPACK_PACKAGE_NAME "valkey") + +valkey_parse_version(CPACK_PACKAGE_VERSION_MAJOR CPACK_PACKAGE_VERSION_MINOR CPACK_PACKAGE_VERSION_PATCH) + +set(CPACK_PACKAGE_CONTACT "maintainers@lists.valkey.io") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Valkey is an open source (BSD) high-performance key/value datastore") +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING") +set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/README.md") +set(CPACK_STRIP_FILES TRUE) + +valkey_get_distro_name(DISTRO_NAME) +message(STATUS "Current host distro: ${DISTRO_NAME}") + +if (DISTRO_NAME MATCHES ubuntu + OR DISTRO_NAME MATCHES debian + OR DISTRO_NAME MATCHES mint) + message(STATUS "Adding target package for ${DISTRO_NAME}") + set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/valkey") + # Debian related parameters + set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Valkey contributors") + set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) + set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) + set(CPACK_GENERATOR "DEB") +endif () + +include(CPack) +unset(DISTRO_NAME CACHE) + +# --------------------------------------------------- +# Create a helper script for creating symbolic links +# 
--------------------------------------------------- +write_file( + ${CMAKE_BINARY_DIR}/CreateSymlink.sh + "\ +#!/bin/bash \n\ +if [ -z \${DESTDIR} ]; then \n\ + # Script is called during 'make install' \n\ + PREFIX=${CMAKE_INSTALL_PREFIX}/bin \n\ +else \n\ + # Script is called during 'make package' \n\ + PREFIX=\${DESTDIR}${CPACK_PACKAGING_INSTALL_PREFIX}/bin \n\ +fi \n\ +cd \$PREFIX \n\ +ln -sf \$1 \$2") diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake new file mode 100644 index 0000000000..d76f17625e --- /dev/null +++ b/cmake/Modules/SourceFiles.cmake @@ -0,0 +1,153 @@ +# ------------------------------------------------- +# Define the sources to be built +# ------------------------------------------------- + +# valkey-server source files +set(VALKEY_SERVER_SRCS + ${CMAKE_SOURCE_DIR}/src/threads_mngr.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/quicklist.c + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/kvstore.c + ${CMAKE_SOURCE_DIR}/src/sds.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/lzf_c.c + ${CMAKE_SOURCE_DIR}/src/lzf_d.c + ${CMAKE_SOURCE_DIR}/src/pqsort.c + ${CMAKE_SOURCE_DIR}/src/zipmap.c + ${CMAKE_SOURCE_DIR}/src/sha1.c + ${CMAKE_SOURCE_DIR}/src/ziplist.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/memory_prefetch.c + ${CMAKE_SOURCE_DIR}/src/io_threads.c + ${CMAKE_SOURCE_DIR}/src/networking.c + ${CMAKE_SOURCE_DIR}/src/util.c + ${CMAKE_SOURCE_DIR}/src/object.c + ${CMAKE_SOURCE_DIR}/src/db.c + ${CMAKE_SOURCE_DIR}/src/replication.c + ${CMAKE_SOURCE_DIR}/src/rdb.c + ${CMAKE_SOURCE_DIR}/src/t_string.c + ${CMAKE_SOURCE_DIR}/src/t_list.c + ${CMAKE_SOURCE_DIR}/src/t_set.c + ${CMAKE_SOURCE_DIR}/src/t_zset.c + ${CMAKE_SOURCE_DIR}/src/t_hash.c + ${CMAKE_SOURCE_DIR}/src/config.c + ${CMAKE_SOURCE_DIR}/src/aof.c + ${CMAKE_SOURCE_DIR}/src/pubsub.c + ${CMAKE_SOURCE_DIR}/src/multi.c + 
${CMAKE_SOURCE_DIR}/src/debug.c + ${CMAKE_SOURCE_DIR}/src/sort.c + ${CMAKE_SOURCE_DIR}/src/intset.c + ${CMAKE_SOURCE_DIR}/src/syncio.c + ${CMAKE_SOURCE_DIR}/src/cluster.c + ${CMAKE_SOURCE_DIR}/src/cluster_legacy.c + ${CMAKE_SOURCE_DIR}/src/cluster_slot_stats.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/endianconv.c + ${CMAKE_SOURCE_DIR}/src/slowlog.c + ${CMAKE_SOURCE_DIR}/src/eval.c + ${CMAKE_SOURCE_DIR}/src/bio.c + ${CMAKE_SOURCE_DIR}/src/rio.c + ${CMAKE_SOURCE_DIR}/src/rand.c + ${CMAKE_SOURCE_DIR}/src/memtest.c + ${CMAKE_SOURCE_DIR}/src/syscheck.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/bitops.c + ${CMAKE_SOURCE_DIR}/src/sentinel.c + ${CMAKE_SOURCE_DIR}/src/notify.c + ${CMAKE_SOURCE_DIR}/src/setproctitle.c + ${CMAKE_SOURCE_DIR}/src/blocked.c + ${CMAKE_SOURCE_DIR}/src/hyperloglog.c + ${CMAKE_SOURCE_DIR}/src/latency.c + ${CMAKE_SOURCE_DIR}/src/sparkline.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-rdb.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-aof.c + ${CMAKE_SOURCE_DIR}/src/geo.c + ${CMAKE_SOURCE_DIR}/src/lazyfree.c + ${CMAKE_SOURCE_DIR}/src/module.c + ${CMAKE_SOURCE_DIR}/src/evict.c + ${CMAKE_SOURCE_DIR}/src/expire.c + ${CMAKE_SOURCE_DIR}/src/geohash.c + ${CMAKE_SOURCE_DIR}/src/geohash_helper.c + ${CMAKE_SOURCE_DIR}/src/childinfo.c + ${CMAKE_SOURCE_DIR}/src/defrag.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/rax.c + ${CMAKE_SOURCE_DIR}/src/t_stream.c + ${CMAKE_SOURCE_DIR}/src/listpack.c + ${CMAKE_SOURCE_DIR}/src/localtime.c + ${CMAKE_SOURCE_DIR}/src/lolwut.c + ${CMAKE_SOURCE_DIR}/src/lolwut5.c + ${CMAKE_SOURCE_DIR}/src/lolwut6.c + ${CMAKE_SOURCE_DIR}/src/acl.c + ${CMAKE_SOURCE_DIR}/src/tracking.c + ${CMAKE_SOURCE_DIR}/src/socket.c + ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/sha256.c + ${CMAKE_SOURCE_DIR}/src/timeout.c + ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + 
${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/resp_parser.c + ${CMAKE_SOURCE_DIR}/src/call_reply.c + ${CMAKE_SOURCE_DIR}/src/script_lua.c + ${CMAKE_SOURCE_DIR}/src/script.c + ${CMAKE_SOURCE_DIR}/src/functions.c + ${CMAKE_SOURCE_DIR}/src/function_lua.c + ${CMAKE_SOURCE_DIR}/src/commands.c + ${CMAKE_SOURCE_DIR}/src/strl.c + ${CMAKE_SOURCE_DIR}/src/connection.c + ${CMAKE_SOURCE_DIR}/src/unix.c + ${CMAKE_SOURCE_DIR}/src/server.c + ${CMAKE_SOURCE_DIR}/src/logreqres.c) + +# valkey-cli +set(VALKEY_CLI_SRCS + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/valkey-cli.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/serverassert.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/cli_common.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/strl.c + ${CMAKE_SOURCE_DIR}/src/cli_commands.c) + +# valkey-benchmark +set(VALKEY_BENCHMARK_SRCS + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/valkey-benchmark.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/serverassert.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/cli_common.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/strl.c) + +# valkey-rdma module +set(VALKEY_RDMA_MODULE_SRCS ${CMAKE_SOURCE_DIR}/src/rdma.c) + +# valkey-tls module +set(VALKEY_TLS_MODULE_SRCS 
${CMAKE_SOURCE_DIR}/src/tls.c) diff --git a/cmake/Modules/Utils.cmake b/cmake/Modules/Utils.cmake new file mode 100644 index 0000000000..304f39fb2c --- /dev/null +++ b/cmake/Modules/Utils.cmake @@ -0,0 +1,102 @@ +# Return the current host distro name. For example: ubuntu, debian, amzn etc +function (valkey_get_distro_name DISTRO_NAME) + if (LINUX AND NOT APPLE) + execute_process( + COMMAND /bin/bash "-c" "cat /etc/os-release |grep ^ID=|cut -d = -f 2" + OUTPUT_VARIABLE _OUT_VAR + OUTPUT_STRIP_TRAILING_WHITESPACE) + # clean the output + string(REPLACE "\"" "" _OUT_VAR "${_OUT_VAR}") + string(REPLACE "." "" _OUT_VAR "${_OUT_VAR}") + set(${DISTRO_NAME} + "${_OUT_VAR}" + PARENT_SCOPE) + elseif (APPLE) + set(${DISTRO_NAME} + "darwin" + PARENT_SCOPE) + elseif (IS_FREEBSD) + set(${DISTRO_NAME} + "freebsd" + PARENT_SCOPE) + else () + set(${DISTRO_NAME} + "unknown" + PARENT_SCOPE) + endif () +endfunction () + +function (valkey_parse_version OUT_MAJOR OUT_MINOR OUT_PATCH) + # Read and parse package version from version.h file + file(STRINGS ${CMAKE_SOURCE_DIR}/src/version.h VERSION_LINES) + foreach (LINE ${VERSION_LINES}) + string(FIND "${LINE}" "#define VALKEY_VERSION " VERSION_STR_POS) + if (VERSION_STR_POS GREATER -1) + string(REPLACE "#define VALKEY_VERSION " "" LINE "${LINE}") + string(REPLACE "\"" "" LINE "${LINE}") + # Change "." to ";" to make it a list + string(REPLACE "." 
";" LINE "${LINE}") + list(GET LINE 0 _MAJOR) + list(GET LINE 1 _MINOR) + list(GET LINE 2 _PATCH) + message(STATUS "Valkey version: ${_MAJOR}.${_MINOR}.${_PATCH}") + # Set the output variables + set(${OUT_MAJOR} + ${_MAJOR} + PARENT_SCOPE) + set(${OUT_MINOR} + ${_MINOR} + PARENT_SCOPE) + set(${OUT_PATCH} + ${_PATCH} + PARENT_SCOPE) + endif () + endforeach () +endfunction () + +# Given input argument `OPTION_VALUE`, check that the `OPTION_VALUE` is from the allowed values (one of: +# module/yes/no/1/0/true/false) +# +# Return value: +# +# If ARG is valid, return its number where: +# +# ~~~ +# - `no` | `0` | `off` => return `0` +# - `yes` | `1` | `on` => return `1` +# - `module` => return `2` +# ~~~ +function (valkey_parse_build_option OPTION_VALUE OUT_ARG_ENUM) + list(APPEND VALID_OPTIONS "yes") + list(APPEND VALID_OPTIONS "1") + list(APPEND VALID_OPTIONS "on") + list(APPEND VALID_OPTIONS "no") + list(APPEND VALID_OPTIONS "0") + list(APPEND VALID_OPTIONS "off") + list(APPEND VALID_OPTIONS "module") + + string(TOLOWER "${OPTION_VALUE}" OPTION_VALUE) + list(FIND VALID_OPTIONS "${ARG}" OPT_INDEX) + if (VERSION_STR_POS GREATER -1) + message(FATAL_ERROR "Invalid value passed ''${OPTION_VALUE}'") + endif () + + if ("${OPTION_VALUE}" STREQUAL "yes" + OR "${OPTION_VALUE}" STREQUAL "1" + OR "${OPTION_VALUE}" STREQUAL "on") + set(${OUT_ARG_ENUM} + 1 + PARENT_SCOPE) + elseif ( + "${OPTION_VALUE}" STREQUAL "no" + OR "${OPTION_VALUE}" STREQUAL "0" + OR "${OPTION_VALUE}" STREQUAL "off") + set(${OUT_ARG_ENUM} + 0 + PARENT_SCOPE) + else () + set(${OUT_ARG_ENUM} + 2 + PARENT_SCOPE) + endif () +endfunction () diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake new file mode 100644 index 0000000000..e935c3b308 --- /dev/null +++ b/cmake/Modules/ValkeySetup.cmake @@ -0,0 +1,381 @@ +include(CheckIncludeFiles) +include(ProcessorCount) +include(Utils) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY 
"${CMAKE_BINARY_DIR}/bin") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") + +# Generate compile_commands.json file for IDEs code completion support +set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + +processorcount(VALKEY_PROCESSOR_COUNT) +message(STATUS "Processor count: ${VALKEY_PROCESSOR_COUNT}") + +# Installed executables will have this permissions +set(VALKEY_EXE_PERMISSIONS + OWNER_EXECUTE + OWNER_WRITE + OWNER_READ + GROUP_EXECUTE + GROUP_READ + WORLD_EXECUTE + WORLD_READ) + +set(VALKEY_SERVER_CFLAGS "") +set(VALKEY_SERVER_LDFLAGS "") + +# ---------------------------------------------------- +# Helper functions & macros +# ---------------------------------------------------- +macro (add_valkey_server_compiler_options value) + set(VALKEY_SERVER_CFLAGS "${VALKEY_SERVER_CFLAGS} ${value}") +endmacro () + +macro (add_valkey_server_linker_option value) + list(APPEND VALKEY_SERVER_LDFLAGS ${value}) +endmacro () + +macro (get_valkey_server_linker_option return_value) + list(JOIN VALKEY_SERVER_LDFLAGS " " ${value} ${return_value}) +endmacro () + +set(IS_FREEBSD 0) +if (CMAKE_SYSTEM_NAME MATCHES "^.*BSD$|DragonFly") + message(STATUS "Building for FreeBSD compatible system") + set(IS_FREEBSD 1) + include_directories("/usr/local/include") + add_valkey_server_compiler_options("-DUSE_BACKTRACE") +endif () + +# Helper function for creating symbolic link so that: link -> source +macro (valkey_create_symlink source link) + install( + CODE "execute_process( \ + COMMAND /bin/bash ${CMAKE_BINARY_DIR}/CreateSymlink.sh \ + ${source} \ + ${link} \ + )" + COMPONENT "valkey") +endmacro () + +# Install a binary +macro (valkey_install_bin target) + # Install cli tool and create a redis symbolic link + install( + TARGETS ${target} + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS ${VALKEY_EXE_PERMISSIONS} + COMPONENT "valkey") +endmacro () + +# Helper function that defines, builds and installs `target` In addition, it creates a symbolic link between the target +# and `link_name` 
+macro (valkey_build_and_install_bin target sources ld_flags libs link_name) + add_executable(${target} ${sources}) + + if (USE_JEMALLOC) + # Using jemalloc + target_link_libraries(${target} jemalloc) + endif () + + # Place this line last to ensure that ${ld_flags} is placed last on the linker line + target_link_libraries(${target} ${libs} ${ld_flags}) + target_link_libraries(${target} hiredis) + if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(${target} OpenSSL::SSL hiredis_ssl) + endif () + + if (IS_FREEBSD) + target_link_libraries(${target} execinfo) + endif () + + # Install cli tool and create a redis symbolic link + valkey_install_bin(${target}) + valkey_create_symlink(${target} ${link_name}) +endmacro () + +# Helper function that defines, builds and installs `target` module. +macro (valkey_build_and_install_module target sources ld_flags libs) + add_library(${target} SHARED ${sources}) + + if (USE_JEMALLOC) + # Using jemalloc + target_link_libraries(${target} jemalloc) + endif () + + # Place this line last to ensure that ${ld_flags} is placed last on the linker line + target_link_libraries(${target} ${libs} ${ld_flags}) + if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(${target} OpenSSL::SSL hiredis_ssl) + endif () + + if (IS_FREEBSD) + target_link_libraries(${target} execinfo) + endif () + + # Install cli tool and create a redis symbolic link + valkey_install_bin(${target}) +endmacro () + +# Determine if we are building in Release or Debug mode +if (CMAKE_BUILD_TYPE MATCHES Debug OR CMAKE_BUILD_TYPE MATCHES DebugFull) + set(VALKEY_DEBUG_BUILD 1) + set(VALKEY_RELEASE_BUILD 0) + message(STATUS "Building in debug mode") +else () + set(VALKEY_DEBUG_BUILD 0) + set(VALKEY_RELEASE_BUILD 1) + message(STATUS "Building in release mode") +endif () + +# ---------------------------------------------------- +# Helper functions - end +# ---------------------------------------------------- + +# 
---------------------------------------------------- +# Build options (allocator, tls, rdma et al) +# ---------------------------------------------------- + +if (NOT BUILD_MALLOC) + if (APPLE) + set(BUILD_MALLOC "libc") + elseif (UNIX) + set(BUILD_MALLOC "jemalloc") + endif () +endif () + +# User may pass different allocator library. Using -DBUILD_MALLOC=, make sure it is a valid value +if (BUILD_MALLOC) + if ("${BUILD_MALLOC}" STREQUAL "jemalloc") + set(MALLOC_LIB "jemalloc") + add_valkey_server_compiler_options("-DUSE_JEMALLOC") + set(USE_JEMALLOC 1) + elseif ("${BUILD_MALLOC}" STREQUAL "libc") + set(MALLOC_LIB "libc") + elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc") + set(MALLOC_LIB "tcmalloc") + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc_minimal") + set(MALLOC_LIB "tcmalloc_minimal") + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + else () + message(FATAL_ERROR "BUILD_MALLOC can be one of: jemalloc, libc, tcmalloc or tcmalloc_minimal") + endif () +endif () + +message(STATUS "Using ${MALLOC_LIB}") + +# TLS support +if (BUILD_TLS) + valkey_parse_build_option(${BUILD_TLS} USE_TLS) + if (USE_TLS EQUAL 1) + # Only search for OpenSSL if needed + find_package(OpenSSL REQUIRED) + message(STATUS "OpenSSL include dir: ${OPENSSL_INCLUDE_DIR}") + message(STATUS "OpenSSL libraries: ${OPENSSL_LIBRARIES}") + include_directories(${OPENSSL_INCLUDE_DIR}) + endif () + + if (USE_TLS EQUAL 1) + add_valkey_server_compiler_options("-DUSE_OPENSSL=1") + add_valkey_server_compiler_options("-DBUILD_TLS_MODULE=0") + else () + # Build TLS as a module RDMA can only be built as a module. 
So disable it + message(WARNING "BUILD_TLS can be one of: [ON | OFF | 1 | 0], but '${BUILD_TLS}' was provided") + message(STATUS "TLS support is disabled") + set(USE_TLS 0) + endif () +else () + # By default, TLS is disabled + message(STATUS "TLS is disabled") + set(USE_TLS 0) +endif () + +if (BUILD_RDMA) + set(BUILD_RDMA_MODULE 0) + # RDMA support (Linux only) + if (LINUX AND NOT APPLE) + valkey_parse_build_option(${BUILD_RDMA} USE_RDMA) + if (USE_RDMA EQUAL 2) # Module + message(STATUS "Building RDMA as module") + add_valkey_server_compiler_options("-DUSE_RDMA=2") + find_package(PkgConfig REQUIRED) + + # Locate librdmacm & libibverbs, fail if we can't find them + pkg_check_modules(RDMACM REQUIRED librdmacm) + pkg_check_modules(IBVERBS REQUIRED libibverbs) + + message(STATUS "${RDMACM_LINK_LIBRARIES};${IBVERBS_LINK_LIBRARIES}") + list(APPEND RDMA_LIBS "${RDMACM_LIBRARIES};${IBVERBS_LIBRARIES}") + unset(RDMACM_LINK_LIBRARIES CACHE) + unset(IBVERBS_LINK_LIBRARIES CACHE) + set(BUILD_RDMA_MODULE 1) + elseif (USE_RDMA EQUAL 1) + # RDMA can only be built as a module. 
So disable it + message(WARNING "BUILD_RDMA can be one of: [NO | 0 | MODULE], but '${BUILD_RDMA}' was provided") + message(STATUS "RDMA build is disabled") + set(USE_RDMA 0) + endif () + else () + message(WARNING "RDMA is only supported on Linux platforms") + endif () +endif () + +set(BUILDING_ARM64 0) +set(BUILDING_ARM32 0) + +if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") + set(BUILDING_ARM64 1) +endif () + +if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm") + set(BUILDING_ARM32 1) +endif () + +message(STATUS "Building on ${CMAKE_HOST_SYSTEM_NAME}") +if (BUILDING_ARM64) + message(STATUS "Compiling valkey for ARM64") + add_valkey_server_linker_option("-funwind-tables") +endif () + +if (APPLE) + add_valkey_server_linker_option("-rdynamic") + add_valkey_server_linker_option("-ldl") +elseif (UNIX) + add_valkey_server_linker_option("-rdynamic") + add_valkey_server_linker_option("-pthread") + add_valkey_server_linker_option("-ldl") + add_valkey_server_linker_option("-lm") +endif () + +if (VALKEY_DEBUG_BUILD) + # Debug build, use enable "-fno-omit-frame-pointer" + add_valkey_server_compiler_options("-fno-omit-frame-pointer") +endif () + +# Check for Atomic +check_include_files(stdatomic.h HAVE_C11_ATOMIC) +if (HAVE_C11_ATOMIC) + add_valkey_server_compiler_options("-std=gnu11") +else () + add_valkey_server_compiler_options("-std=c99") +endif () + +# Sanitizer +if (BUILD_SANITIZER) + # For best results, force libc + set(MALLOC_LIB, "libc") + if ("${BUILD_SANITIZER}" STREQUAL "address") + add_valkey_server_compiler_options("-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer") + add_valkey_server_linker_option("-fsanitize=address") + elseif ("${BUILD_SANITIZER}" STREQUAL "thread") + add_valkey_server_compiler_options("-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer") + add_valkey_server_linker_option("-fsanitize=thread") + elseif ("${BUILD_SANITIZER}" STREQUAL "undefined") + add_valkey_server_compiler_options("-fsanitize=undefined 
-fno-sanitize-recover=all -fno-omit-frame-pointer") + add_valkey_server_linker_option("-fsanitize=undefined") + else () + message(FATAL_ERROR "Unknown sanitizer: ${BUILD_SANITIZER}") + endif () +endif () + +include_directories("${CMAKE_SOURCE_DIR}/deps/hiredis") +include_directories("${CMAKE_SOURCE_DIR}/deps/linenoise") +include_directories("${CMAKE_SOURCE_DIR}/deps/lua/src") +include_directories("${CMAKE_SOURCE_DIR}/deps/hdr_histogram") +include_directories("${CMAKE_SOURCE_DIR}/deps/fpconv") + +add_subdirectory("${CMAKE_SOURCE_DIR}/deps") + +# Update linker flags for the allocator +if (USE_JEMALLOC) + include_directories("${CMAKE_SOURCE_DIR}/deps/jemalloc/include") +endif () + +# Common compiler flags +add_valkey_server_compiler_options("-pedantic") + +# ---------------------------------------------------- +# Build options (allocator, tls, rdma et al) - end +# ---------------------------------------------------- + +# ------------------------------------------------- +# Code Generation section +# ------------------------------------------------- +find_program(PYTHON_EXE python3) +if (PYTHON_EXE) + # Python based code generation + message(STATUS "Found python3: ${PYTHON_EXE}") + # Rule for generating commands.def file from json files + message(STATUS "Adding target generate_commands_def") + file(GLOB COMMAND_FILES_JSON "${CMAKE_SOURCE_DIR}/src/commands/*.json") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/commands_def_generated + DEPENDS ${COMMAND_FILES_JSON} + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-command-code.py + COMMAND touch ${CMAKE_BINARY_DIR}/commands_def_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_commands_def DEPENDS ${CMAKE_BINARY_DIR}/commands_def_generated) + + # Rule for generating fmtargs.h + message(STATUS "Adding target generate_fmtargs_h") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/fmtargs_generated + DEPENDS ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py + COMMAND sed 
'/Everything/,$$d' fmtargs.h > fmtargs.h.tmp + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py >> fmtargs.h.tmp + COMMAND mv fmtargs.h.tmp fmtargs.h + COMMAND touch ${CMAKE_BINARY_DIR}/fmtargs_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_fmtargs_h DEPENDS ${CMAKE_BINARY_DIR}/fmtargs_generated) + + # Rule for generating test_files.h + message(STATUS "Adding target generate_test_files_h") + file(GLOB UNIT_TEST_SRCS "${CMAKE_SOURCE_DIR}/src/unit/*.c") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/test_files_generated + DEPENDS "${UNIT_TEST_SRCS};${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py" + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py + COMMAND touch ${CMAKE_BINARY_DIR}/test_files_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_test_files_h DEPENDS ${CMAKE_BINARY_DIR}/test_files_generated) +else () + # Fake targets + add_custom_target(generate_commands_def) + add_custom_target(generate_fmtargs_h) + add_custom_target(generate_test_files_h) +endif () + +# Generate release.h file (always) +add_custom_target( + release_header + COMMAND sh -c '${CMAKE_SOURCE_DIR}/src/mkreleasehdr.sh' + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + +# ------------------------------------------------- +# Code Generation section - end +# ------------------------------------------------- + +# ---------------------------------------------------------- +# All our source files are defined in SourceFiles.cmake file +# ---------------------------------------------------------- +include(SourceFiles) + +# Clear the below variables from the cache +unset(CMAKE_C_FLAGS CACHE) +unset(BUILD_SANITIZER CACHE) +unset(VALKEY_SERVER_LDFLAGS CACHE) +unset(VALKEY_SERVER_CFLAGS CACHE) +unset(PYTHON_EXE CACHE) +unset(HAVE_C11_ATOMIC CACHE) +unset(USE_TLS CACHE) +unset(USE_RDMA CACHE) +unset(BUILD_TLS CACHE) +unset(BUILD_RDMA CACHE) +unset(BUILD_MALLOC CACHE) 
+unset(USE_JEMALLOC CACHE) +unset(BUILD_TLS_MODULE CACHE) +unset(BUILD_TLS_BUILTIN CACHE) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt new file mode 100644 index 0000000000..c904b94031 --- /dev/null +++ b/deps/CMakeLists.txt @@ -0,0 +1,26 @@ +add_subdirectory(jemalloc) +add_subdirectory(lua) + +# Set hiredis options. We need to disable the defaults set in the OPTION(..) we do this by setting them in the CACHE +set(BUILD_SHARED_LIBS + OFF + CACHE BOOL "Build shared libraries") +set(DISABLE_TESTS + ON + CACHE BOOL "If tests should be compiled or not") +if (USE_TLS) # Module or no module + message(STATUS "Building hiredis_ssl") + set(ENABLE_SSL + ON + CACHE BOOL "Should we test SSL connections") +endif () + +add_subdirectory(hiredis) +add_subdirectory(linenoise) +add_subdirectory(fpconv) +add_subdirectory(hdr_histogram) + +# Clear any cached variables passed to hiredis from the cache +unset(BUILD_SHARED_LIBS CACHE) +unset(DISABLE_TESTS CACHE) +unset(ENABLE_SSL CACHE) diff --git a/deps/fpconv/CMakeLists.txt b/deps/fpconv/CMakeLists.txt new file mode 100644 index 0000000000..c586aa650a --- /dev/null +++ b/deps/fpconv/CMakeLists.txt @@ -0,0 +1,4 @@ +project(fpconv) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.c" "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.h") +add_library(fpconv STATIC ${SRCS}) diff --git a/deps/hdr_histogram/CMakeLists.txt b/deps/hdr_histogram/CMakeLists.txt new file mode 100644 index 0000000000..7b45bd76ba --- /dev/null +++ b/deps/hdr_histogram/CMakeLists.txt @@ -0,0 +1,7 @@ +project(hdr_histogram) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.c" "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.h" + "${CMAKE_CURRENT_LIST_DIR}/hdr_atomic.h" "${CMAKE_CURRENT_LIST_DIR}/hdr_redis_malloc.h") + +add_library(hdr_histogram STATIC ${SRCS}) +target_compile_definitions(hdr_histogram PRIVATE HDR_MALLOC_INCLUDE=\"hdr_redis_malloc.h\") diff --git a/deps/jemalloc/CMakeLists.txt b/deps/jemalloc/CMakeLists.txt new file mode 100644 index 0000000000..e79e960ec2 
--- /dev/null +++ b/deps/jemalloc/CMakeLists.txt @@ -0,0 +1,23 @@ +project(jemalloc) + +# Build jemalloc using configure && make install +set(JEMALLOC_INSTALL_DIR ${CMAKE_BINARY_DIR}/jemalloc-build) +set(JEMALLOC_SRC_DIR ${CMAKE_CURRENT_LIST_DIR}) +if (NOT EXISTS ${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a) + message(STATUS "Building jemalloc (custom build)") + message(STATUS "JEMALLOC_SRC_DIR = ${JEMALLOC_SRC_DIR}") + message(STATUS "JEMALLOC_INSTALL_DIR = ${JEMALLOC_INSTALL_DIR}") + + execute_process( + COMMAND sh -c "${JEMALLOC_SRC_DIR}/configure --disable-cxx \ + --with-version=5.3.0-0-g0 --with-lg-quantum=3 --disable-cache-oblivious --with-jemalloc-prefix=je_ \ + --enable-static --disable-shared --prefix=${JEMALLOC_INSTALL_DIR}" + WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND make -j${VALKEY_PROCESSOR_COUNT} lib/libjemalloc.a install + WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}") +endif () + +# Import the compiled library as a CMake target +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a" + INCLUDE_DIRECTORIES "${JEMALLOC_INSTALL_DIR}/include") diff --git a/deps/linenoise/CMakeLists.txt b/deps/linenoise/CMakeLists.txt new file mode 100644 index 0000000000..f801e4abf1 --- /dev/null +++ b/deps/linenoise/CMakeLists.txt @@ -0,0 +1,4 @@ +project(linenoise) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/linenoise.c" "${CMAKE_CURRENT_LIST_DIR}/linenoise.h") +add_library(linenoise STATIC ${SRCS}) diff --git a/deps/lua/CMakeLists.txt b/deps/lua/CMakeLists.txt new file mode 100644 index 0000000000..e911de9232 --- /dev/null +++ b/deps/lua/CMakeLists.txt @@ -0,0 +1,44 @@ +project(lualib) + +set(LUA_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/src") +set(LUA_SRCS + ${LUA_SRC_DIR}/fpconv.c + ${LUA_SRC_DIR}/lbaselib.c + ${LUA_SRC_DIR}/lmathlib.c + ${LUA_SRC_DIR}/lstring.c + ${LUA_SRC_DIR}/lparser.c + ${LUA_SRC_DIR}/ldo.c + ${LUA_SRC_DIR}/lzio.c + 
${LUA_SRC_DIR}/lmem.c + ${LUA_SRC_DIR}/strbuf.c + ${LUA_SRC_DIR}/lstrlib.c + ${LUA_SRC_DIR}/lundump.c + ${LUA_SRC_DIR}/lua_cmsgpack.c + ${LUA_SRC_DIR}/loslib.c + ${LUA_SRC_DIR}/lua_struct.c + ${LUA_SRC_DIR}/ldebug.c + ${LUA_SRC_DIR}/lobject.c + ${LUA_SRC_DIR}/ldump.c + ${LUA_SRC_DIR}/lua_cjson.c + ${LUA_SRC_DIR}/ldblib.c + ${LUA_SRC_DIR}/ltm.c + ${LUA_SRC_DIR}/ltable.c + ${LUA_SRC_DIR}/lstate.c + ${LUA_SRC_DIR}/lua_bit.c + ${LUA_SRC_DIR}/lua.c + ${LUA_SRC_DIR}/loadlib.c + ${LUA_SRC_DIR}/lcode.c + ${LUA_SRC_DIR}/lapi.c + ${LUA_SRC_DIR}/lgc.c + ${LUA_SRC_DIR}/lvm.c + ${LUA_SRC_DIR}/lfunc.c + ${LUA_SRC_DIR}/lauxlib.c + ${LUA_SRC_DIR}/ltablib.c + ${LUA_SRC_DIR}/linit.c + ${LUA_SRC_DIR}/lopcodes.c + ${LUA_SRC_DIR}/llex.c + ${LUA_SRC_DIR}/liolib.c) + +add_library(lualib STATIC "${LUA_SRCS}") +target_include_directories(lualib PUBLIC "${LUA_SRC_DIR}") +target_compile_definitions(lualib PRIVATE ENABLE_CJSON_GLOBAL) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..b7e328163b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,77 @@ +project(valkey-server) + +set(INSTALL_BIN_PATH ${CMAKE_INSTALL_PREFIX}/bin) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) + +# Target: valkey-server +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${VALKEY_SERVER_CFLAGS}") +message(STATUS "CFLAGS: ${CMAKE_C_FLAGS}") + +get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS) +list(APPEND SERVER_LIBS "fpconv") +list(APPEND SERVER_LIBS "lualib") +list(APPEND SERVER_LIBS "hdr_histogram") +valkey_build_and_install_bin(valkey-server "${VALKEY_SERVER_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${SERVER_LIBS}" + "redis-server") +add_dependencies(valkey-server generate_commands_def) +add_dependencies(valkey-server generate_fmtargs_h) +add_dependencies(valkey-server release_header) + +if (VALKEY_RELEASE_BUILD) + # Enable LTO for Release build + set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) +endif () + +# Target: valkey-cli +list(APPEND CLI_LIBS 
"linenoise") +valkey_build_and_install_bin(valkey-cli "${VALKEY_CLI_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${CLI_LIBS}" "redis-cli") +add_dependencies(valkey-cli generate_commands_def) +add_dependencies(valkey-cli generate_fmtargs_h) + +# Target: valkey-benchmark +list(APPEND BENCH_LIBS "hdr_histogram") +valkey_build_and_install_bin(valkey-benchmark "${VALKEY_BENCHMARK_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${BENCH_LIBS}" + "redis-benchmark") +add_dependencies(valkey-benchmark generate_commands_def) +add_dependencies(valkey-benchmark generate_fmtargs_h) + +# Targets: valkey-sentinel, valkey-check-aof and valkey-check-rdb are just symbolic links +valkey_create_symlink("valkey-server" "valkey-sentinel") +valkey_create_symlink("valkey-server" "valkey-check-rdb") +valkey_create_symlink("valkey-server" "valkey-check-aof") + +# Target valkey-rdma +if (BUILD_RDMA_MODULE) + set(MODULE_NAME "valkey-rdma") + message(STATUS "Building RDMA module") + add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") + target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE -DUSE_RDMA=1) + target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") + # remove the "lib" prefix from the module + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) +endif () + +# Target valkey-tls (a module) +if (BUILD_TLS_MODULE) + message(STATUS "Building TLS as a module") + set(MODULE_NAME "valkey-tls") + add_library(${MODULE_NAME} SHARED ${VALKEY_TLS_MODULE_SRCS}) + target_compile_options(${MODULE_NAME} PRIVATE -DUSE_OPENSSL=2 -DBUILD_TLS_MODULE=2) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () + target_link_libraries(${MODULE_NAME} hiredis_ssl OpenSSL::SSL) + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") +endif () + +if (BUILD_EXAMPLE_MODULES) + # Include the modules ("hello*") + message(STATUS "Building example 
modules") + add_subdirectory(modules) +endif () + +if (BUILD_UNIT_TESTS) + add_subdirectory(unit) +endif () diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt new file mode 100644 index 0000000000..958796232f --- /dev/null +++ b/src/modules/CMakeLists.txt @@ -0,0 +1,21 @@ +# Build modules +list(APPEND MODULES_LIST "helloacl") +list(APPEND MODULES_LIST "helloblock") +list(APPEND MODULES_LIST "hellocluster") +list(APPEND MODULES_LIST "hellodict") +list(APPEND MODULES_LIST "hellohook") +list(APPEND MODULES_LIST "hellotimer") +list(APPEND MODULES_LIST "hellotype") +list(APPEND MODULES_LIST "helloworld") + +foreach (MODULE_NAME ${MODULES_LIST}) + message(STATUS "Building module: ${MODULE_NAME}") + add_library(${MODULE_NAME} SHARED "${CMAKE_CURRENT_LIST_DIR}/${MODULE_NAME}.c") + target_include_directories(${MODULE_NAME} PRIVATE "${CMAKE_SOURCE_DIR}/src") + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () +endforeach () diff --git a/src/server.c b/src/server.c index eda9a5b582..e8c13dd763 100644 --- a/src/server.c +++ b/src/server.c @@ -7148,5 +7148,4 @@ __attribute__((weak)) int main(int argc, char **argv) { aeDeleteEventLoop(server.el); return 0; } - /* The End */ diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt new file mode 100644 index 0000000000..7d80c533cf --- /dev/null +++ b/src/unit/CMakeLists.txt @@ -0,0 +1,58 @@ +project(valkey-unit-tests) + +file(GLOB UNIT_TEST_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.c") +set(UNIT_TEST_SRCS "${UNIT_TEST_SRCS}") + +get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS) + +# Build unit tests only +message(STATUS "Building unit tests") +list(APPEND COMPILE_DEFINITIONS "SERVER_TEST=1") +if (USE_TLS) + if (BUILD_TLS_MODULE) + # TLS as a module + list(APPEND COMPILE_DEFINITIONS 
"USE_OPENSSL=2") + else (BUILD_TLS_MODULE) + # Built-in TLS support + list(APPEND COMPILE_DEFINITIONS "USE_OPENSSL=1") + list(APPEND COMPILE_DEFINITIONS "BUILD_TLS_MODULE=0") + endif () +endif () + +# Build Valkey sources as a static library for the test +add_library(valkeylib STATIC ${VALKEY_SERVER_SRCS}) +target_compile_options(valkeylib PRIVATE "${COMPILE_FLAGS}") +target_compile_definitions(valkeylib PRIVATE "${COMPILE_DEFINITIONS}") + +add_executable(valkey-unit-tests ${UNIT_TEST_SRCS}) +target_compile_options(valkey-unit-tests PRIVATE "${COMPILE_FLAGS}") +target_compile_definitions(valkey-unit-tests PRIVATE "${COMPILE_DEFINITIONS}") +add_dependencies(valkey-unit-tests generate_test_files_h) + +if (UNIX AND NOT APPLE) + # Avoid duplicate symbols on non macOS + target_link_options(valkey-unit-tests PRIVATE "-Wl,--allow-multiple-definition") +endif () + +if (USE_JEMALLOC) + # Using jemalloc + target_link_libraries(valkey-unit-tests jemalloc) +endif () + +if (IS_FREEBSD) + target_link_libraries(valkey-unit-tests execinfo) +endif () + +target_link_libraries( + valkey-unit-tests + valkeylib + fpconv + lualib + hdr_histogram + hiredis + ${VALKEY_SERVER_LDFLAGS}) + +if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(valkey-unit-tests OpenSSL::SSL hiredis_ssl) +endif () diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000000..2a76897bb0 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,5 @@ +add_subdirectory(rdma) + +if (BUILD_TEST_MODULES) + add_subdirectory(modules) +endif () diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt new file mode 100644 index 0000000000..0cac0c4cb6 --- /dev/null +++ b/tests/modules/CMakeLists.txt @@ -0,0 +1,58 @@ +# Build test modules +list(APPEND MODULES_LIST "commandfilter") +list(APPEND MODULES_LIST "basics") +list(APPEND MODULES_LIST "testrdb") +list(APPEND MODULES_LIST "fork") +list(APPEND MODULES_LIST "infotest") +list(APPEND MODULES_LIST 
"propagate") +list(APPEND MODULES_LIST "misc") +list(APPEND MODULES_LIST "hooks") +list(APPEND MODULES_LIST "blockonkeys") +list(APPEND MODULES_LIST "blockonbackground") +list(APPEND MODULES_LIST "scan") +list(APPEND MODULES_LIST "datatype") +list(APPEND MODULES_LIST "datatype2") +list(APPEND MODULES_LIST "auth") +list(APPEND MODULES_LIST "keyspace_events") +list(APPEND MODULES_LIST "blockedclient") +list(APPEND MODULES_LIST "getkeys") +list(APPEND MODULES_LIST "getchannels") +list(APPEND MODULES_LIST "test_lazyfree") +list(APPEND MODULES_LIST "timer") +list(APPEND MODULES_LIST "defragtest") +list(APPEND MODULES_LIST "keyspecs") +list(APPEND MODULES_LIST "hash") +list(APPEND MODULES_LIST "zset") +list(APPEND MODULES_LIST "stream") +list(APPEND MODULES_LIST "mallocsize") +list(APPEND MODULES_LIST "aclcheck") +list(APPEND MODULES_LIST "list") +list(APPEND MODULES_LIST "subcommands") +list(APPEND MODULES_LIST "reply") +list(APPEND MODULES_LIST "cmdintrospection") +list(APPEND MODULES_LIST "eventloop") +list(APPEND MODULES_LIST "moduleconfigs") +list(APPEND MODULES_LIST "moduleconfigstwo") +list(APPEND MODULES_LIST "publish") +list(APPEND MODULES_LIST "usercall") +list(APPEND MODULES_LIST "postnotifications") +list(APPEND MODULES_LIST "moduleauthtwo") +list(APPEND MODULES_LIST "rdbloadsave") +list(APPEND MODULES_LIST "crash") +list(APPEND MODULES_LIST "cluster") + +foreach (MODULE_NAME ${MODULES_LIST}) + message(STATUS "Building test module: ${MODULE_NAME}") + add_library(${MODULE_NAME} SHARED "${CMAKE_SOURCE_DIR}/tests/modules/${MODULE_NAME}.c") + target_include_directories(${MODULE_NAME} PRIVATE "${CMAKE_SOURCE_DIR}/src") + if (LINUX AND NOT APPLE) + # set the std to gnu11 here, to allow crash.c to get compiled + target_compile_options(${MODULE_NAME} PRIVATE "-std=gnu11") + endif () + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the 
executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () +endforeach () diff --git a/tests/rdma/CMakeLists.txt b/tests/rdma/CMakeLists.txt new file mode 100644 index 0000000000..f721e9af52 --- /dev/null +++ b/tests/rdma/CMakeLists.txt @@ -0,0 +1,9 @@ +project(rdma-test) + +# Make sure RDMA build is enabled +if (BUILD_RDMA_MODULE) + add_executable(rdma-test "${CMAKE_SOURCE_DIR}/tests/rdma/rdma-test.c") + target_link_libraries(rdma-test "${RDMA_LIBS}") + target_link_options(rdma-test PRIVATE "-pthread") + valkey_install_bin(rdma-test) +endif () From e972d564609d50e97e19672b19f7590c09b4c086 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Fri, 8 Nov 2024 02:25:43 +0000 Subject: [PATCH 18/92] Make sure to copy null terminator byte in dual channel code (#1272) As @madolson pointed out, these do have proper null terminators. This cleans them up to follow the rest of the code which copies the last byte explicitly, which should help reduce cognitive load and make it more resilient should code refactors occur (e.g. non-static allocation of memory, changes to other functions). 
--------- Signed-off-by: Jacob Murphy --- src/replication.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index 6e8faff7a2..48e98ab8e7 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2697,7 +2697,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ server.repl_provisional_primary.conn = server.repl_transfer_s; - memcpy(server.repl_provisional_primary.replid, primary_replid, CONFIG_RUN_ID_SIZE); + memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); server.repl_provisional_primary.reploff = reploffset; server.repl_provisional_primary.read_reploff = reploffset; server.repl_provisional_primary.dbid = dbid; @@ -4269,7 +4269,7 @@ void replicationResurrectProvisionalPrimary(void) { /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); - memcpy(server.primary->replid, server.repl_provisional_primary.replid, CONFIG_RUN_ID_SIZE); + memcpy(server.primary->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); server.primary->reploff = server.repl_provisional_primary.reploff; server.primary->read_reploff = server.repl_provisional_primary.read_reploff; server.primary_repl_offset = server.primary->reploff; From 45d596e1216472e49b9f950a4b9a040b6e87add6 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 8 Nov 2024 16:33:01 +0800 Subject: [PATCH 19/92] RDMA: Use conn ref counter to prevent double close (#1250) RDMA: Use connection reference counter style The reference counter of connection is used to protect re-entry of closenmethod. Use this style instead the unsafe one. 
Signed-off-by: zhenwei pi --- src/rdma.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/rdma.c b/src/rdma.c index bb38baa0f1..7cdcb24913 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -1199,6 +1199,14 @@ static void connRdmaClose(connection *conn) { conn->fd = -1; } + /* If called from within a handler, schedule the close but + * keep the connection until the handler returns. + */ + if (connHasRefs(conn)) { + conn->flags |= CONN_FLAG_CLOSE_SCHEDULED; + return; + } + if (!cm_id) { return; } @@ -1689,7 +1697,6 @@ static int rdmaProcessPendingData(void) { listNode *ln; rdma_connection *rdma_conn; connection *conn; - listNode *node; int processed; processed = listLength(pending_list); @@ -1697,17 +1704,17 @@ static int rdmaProcessPendingData(void) { while ((ln = listNext(&li))) { rdma_conn = listNodeValue(ln); conn = &rdma_conn->c; - node = rdma_conn->pending_list_node; /* a connection can be disconnected by remote peer, CM event mark state as CONN_STATE_CLOSED, kick connection * read/write handler to close connection */ if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { - listDelNode(pending_list, node); - /* do NOT call callHandler(conn, conn->read_handler) here, conn is freed in handler! */ - if (conn->read_handler) { - conn->read_handler(conn); - } else if (conn->write_handler) { - conn->write_handler(conn); + listDelNode(pending_list, rdma_conn->pending_list_node); + rdma_conn->pending_list_node = NULL; + /* Invoke both read_handler and write_handler, unless read_handler + returns 0, indicating the connection has closed, in which case + write_handler will be skipped. 
*/ + if (callHandler(conn, conn->read_handler)) { + callHandler(conn, conn->write_handler); } continue; From 0b5b2c7484e6d401ce7818571bde09b49f88180e Mon Sep 17 00:00:00 2001 From: zixuan zhao Date: Mon, 11 Nov 2024 04:33:26 -0500 Subject: [PATCH 20/92] Log as primary role (M) instead of child process (C) during startup (#1282) Init server.pid earlier to keep log message role consistent. Closes #1206. Before: ```text 24881:C 21 Oct 2024 21:10:57.165 * oO0OoO0OoO0Oo Valkey is starting oO0OoO0OoO0Oo 24881:C 21 Oct 2024 21:10:57.165 * Valkey version=255.255.255, bits=64, commit=814e0f55, modified=1, pid=24881, just started 24881:C 21 Oct 2024 21:10:57.165 * Configuration loaded 24881:M 21 Oct 2024 21:10:57.167 * Increased maximum number of open files to 10032 (it was originally set to 1024). ``` After: ```text 68560:M 08 Nov 2024 16:10:12.257 * oO0OoO0OoO0Oo Valkey is starting oO0OoO0OoO0Oo 68560:M 08 Nov 2024 16:10:12.257 * Valkey version=255.255.255, bits=64, commit=45d596e1, modified=1, pid=68560, just started 68560:M 08 Nov 2024 16:10:12.257 * Configuration loaded 68560:M 08 Nov 2024 16:10:12.258 * monotonic clock: POSIX clock_gettime ``` Signed-off-by: azuredream --- src/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index e8c13dd763..0c4ddbe4b8 100644 --- a/src/server.c +++ b/src/server.c @@ -2670,7 +2670,6 @@ void initServer(void) { server.aof_state = server.aof_enabled ? AOF_ON : AOF_OFF; server.fsynced_reploff = server.aof_enabled ? 
0 : -1; server.hz = server.config_hz; - server.pid = getpid(); server.in_fork_child = CHILD_TYPE_NONE; server.rdb_pipe_read = -1; server.rdb_child_exit_pipe = -1; @@ -6883,6 +6882,7 @@ __attribute__((weak)) int main(int argc, char **argv) { if (exec_name == NULL) exec_name = argv[0]; server.sentinel_mode = checkForSentinelMode(argc, argv, exec_name); initServerConfig(); + server.pid = getpid(); ACLInit(); /* The ACL subsystem must be initialized ASAP because the basic networking code and client creation depends on it. */ moduleInitModulesSystem(); From 9300a7ebc856356f1d55df16ddfb845773b5daca Mon Sep 17 00:00:00 2001 From: Qu Chen Date: Mon, 11 Nov 2024 01:39:48 -0800 Subject: [PATCH 21/92] Set fields to NULL after free in freeClient() (#1279) Null out several references after freeing the object in `freeClient()`. This is just to make the code more safe, to protect against use-after-free for future changes. Signed-off-by: Qu Chen --- src/networking.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/networking.c b/src/networking.c index 96dd05d505..1a008a852d 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1731,6 +1731,7 @@ void freeClient(client *c) { /* UNWATCH all the keys */ unwatchAllKeys(c); listRelease(c->watched_keys); + c->watched_keys = NULL; /* Unsubscribe from all the pubsub channels */ pubsubUnsubscribeAllChannels(c, 0); @@ -1738,16 +1739,22 @@ void freeClient(client *c) { pubsubUnsubscribeAllPatterns(c, 0); unmarkClientAsPubSub(c); dictRelease(c->pubsub_channels); + c->pubsub_channels = NULL; dictRelease(c->pubsub_patterns); + c->pubsub_patterns = NULL; dictRelease(c->pubsubshard_channels); + c->pubsubshard_channels = NULL; /* Free data structures. 
*/ listRelease(c->reply); + c->reply = NULL; zfree(c->buf); + c->buf = NULL; freeReplicaReferencedReplBuffer(c); freeClientArgv(c); freeClientOriginalArgv(c); if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors); + c->deferred_reply_errors = NULL; #ifdef LOG_REQ_RES reqresReset(c, 1); #endif From 4aacffa32da07eb09b271c7c3dfbd58c7a2cb8d1 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 11 Nov 2024 21:42:34 +0800 Subject: [PATCH 22/92] Stabilize dual replication test to avoid getting LOADING error (#1288) When doing `$replica replicaof no one`, we may get a LOADING error, this is because during the test execution, the replica may reconnect very quickly, and the full sync is initiated, and the replica has entered the LOADING state. In this commit, we make sure the primary is pasued after the fork, so the replica won't enter the LOADING state, and with this fix, this test seems more natural and predictable. Signed-off-by: Binbin --- .../integration/dual-channel-replication.tcl | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 5302030db9..05bdc130c1 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -23,14 +23,20 @@ proc get_client_id_by_last_cmd {r cmd} { return $client_id } -# Wait until the process enters a paused state, then resume the process. -proc wait_and_resume_process idx { +# Wait until the process enters a paused state. +proc wait_process_paused idx { set pid [srv $idx pid] wait_for_condition 50 1000 { [string match "T*" [exec ps -o state= -p $pid]] } else { fail "Process $pid didn't stop, current state is [exec ps -o state= -p $pid]" } +} + +# Wait until the process enters a paused state, then resume the process. 
+proc wait_and_resume_process idx { + set pid [srv $idx pid] + wait_process_paused $idx resume_process $pid } @@ -790,11 +796,20 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary did not free repl buf block after sync failure" } + # Full sync will be triggered after the replica is reconnected, pause primary main process after fork. + # In this way, in the subsequent replicaof no one, we won't get the LOADING error if the replica reconnects + # too quickly and enters the loading state. + $primary debug pause-after-fork 1 resume_process $replica_pid set res [wait_for_log_messages -1 {"*Unable to partial resync with replica * for lack of backlog*"} $loglines 2000 10] set loglines [lindex $res 1] } + # Waiting for the primary to enter the paused state, that is, make sure that bgsave is triggered. + wait_process_paused -1 $replica replicaof no one + # Resume the primary and make sure the sync is dropped. + resume_process [srv -1 pid] + $primary debug pause-after-fork 0 wait_for_condition 500 1000 { [s -1 rdb_bgsave_in_progress] eq 0 } else { From 167e8ab8de4c26a41222d94fcf0ccbd1864a9774 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 11 Nov 2024 21:43:46 +0800 Subject: [PATCH 23/92] Trigger the election immediately when doing a manual failover (#1081) Currently when a manual failover is triggeded, we will set a CLUSTER_TODO_HANDLE_FAILOVER to start the election as soon as possible in the next beforeSleep. But in fact, we won't delay the election in manual failover, waitting for the next beforeSleep to kick in will delay the election a some milliseconds. We can trigger the election immediately in this case in the same function call, without waitting for beforeSleep, which can save us some milliseconds. 
Signed-off-by: Binbin --- src/cluster_legacy.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f1c9eb1fcf..04a04774fe 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4519,8 +4519,9 @@ void clusterFailoverReplaceYourPrimary(void) { * 3) Perform the failover informing all the other nodes. */ void clusterHandleReplicaFailover(void) { + mstime_t now = mstime(); mstime_t data_age; - mstime_t auth_age = mstime() - server.cluster->failover_auth_time; + mstime_t auth_age = now - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int manual_failover = server.cluster->mf_end != 0 && server.cluster->mf_can_start; mstime_t auth_timeout, auth_retry_time; @@ -4582,7 +4583,7 @@ void clusterHandleReplicaFailover(void) { /* If the previous failover attempt timeout and the retry time has * elapsed, we can setup a new one. */ if (auth_age > auth_retry_time) { - server.cluster->failover_auth_time = mstime() + + server.cluster->failover_auth_time = now + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; @@ -4594,20 +4595,26 @@ void clusterHandleReplicaFailover(void) { server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000; /* However if this is a manual failover, no delay is needed. */ if (server.cluster->mf_end) { - server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_time = now; server.cluster->failover_auth_rank = 0; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + /* Reset auth_age since it is outdated now and we can bypass the auth_timeout + * check in the next state and start the election ASAP. 
*/ + auth_age = 0; } serverLog(LL_NOTICE, "Start of election delayed for %lld milliseconds " "(rank #%d, offset %lld).", - server.cluster->failover_auth_time - mstime(), server.cluster->failover_auth_rank, + server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank, replicationGetReplicaOffset()); /* Now that we have a scheduled election, broadcast our offset * to all the other replicas so that they'll updated their offsets * if our offset is better. */ clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_REPLICAS); - return; + + /* Return ASAP if we can't start the election now. In a manual failover, + * we can start the election immediately, so in this case we continue to + * the next state without waiting for the next beforeSleep. */ + if (now < server.cluster->failover_auth_time) return; } /* It is possible that we received more updated offsets from other @@ -4627,7 +4634,7 @@ void clusterHandleReplicaFailover(void) { } /* Return ASAP if we can't still start the election. */ - if (mstime() < server.cluster->failover_auth_time) { + if (now < server.cluster->failover_auth_time) { clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); return; } From a2d22c63c007eee1709ca71d9bf1e912fadb4f87 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 11 Nov 2024 22:12:49 +0800 Subject: [PATCH 24/92] Fix replica not able to initate election in time when epoch fails (#1009) If multiple primary nodes go down at the same time, their replica nodes will initiate the elections at the same time. There is a certain probability that the replicas will initate the elections in the same epoch. And obviously, in our current election mechanism, only one replica node can eventually get the enough votes, and the other replica node will fail to win due the the insufficient majority, and then its election will time out and we will wait for the retry, which result in a long failure time. 
If another node has won the election in the failover epoch, we can assume that our election has failed and we can retry as soon as possible.
*/ sender->repl_offset = ntohu64(hdr->offset); diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 7bc6a05e95..21c4f4a678 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -64,3 +64,36 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster + + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} { + test "Primaries will not time out then they are elected in the same epoch" { + # Since we have the delay time, so these node may not initiate the + # election at the same time (same epoch). But if they do, we make + # sure there is no failover timeout. + + # Killing there primary nodes. + pause_process [srv 0 pid] + pause_process [srv -1 pid] + pause_process [srv -2 pid] + + # Wait for the failover + wait_for_condition 1000 50 { + [s -7 role] == "master" && + [s -8 role] == "master" && + [s -9 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure there is no failover timeout. + verify_no_log_message -7 "*Failover attempt expired*" 0 + verify_no_log_message -8 "*Failover attempt expired*" 0 + verify_no_log_message -9 "*Failover attempt expired*" 0 + + # Resuming these primary nodes, speed up the shutdown. + resume_process [srv 0 pid] + resume_process [srv -1 pid] + resume_process [srv -2 pid] + } +} ;# start_cluster From 2df56d87c0ebe802f38e8922bb2ea1e4ca9cfa76 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 11 Nov 2024 22:13:47 +0800 Subject: [PATCH 25/92] Fix empty primary may have dirty slots data due to bad migration (#1285) If we become an empty primary for some reason, we still need to check if we need to delete dirty slots, because we may have dirty slots data left over from a bad migration. Like the target node forcibly executes CLUSTER SETSLOT NODE to take over the slot without performing key migration. 
Signed-off-by: Binbin --- src/cluster_legacy.c | 13 ++++++++++++- tests/unit/cluster/replica-migration.tcl | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ee7e4c531e..cfde3fd797 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2451,6 +2451,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * need to delete all the keys in the slots we lost ownership. */ uint16_t dirty_slots[CLUSTER_SLOTS]; int dirty_slots_count = 0; + int delete_dirty_slots = 0; /* We should detect if sender is new primary of our shard. * We will know it if all our slots were migrated to sender, and sender @@ -2677,6 +2678,12 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc serverLog(LL_NOTICE, "My last slot was migrated to node %.40s (%s) in shard %.40s. I am now an empty primary.", sender->name, sender->human_nodename, sender->shard_id); + /* We may still have dirty slots when we became a empty primary due to + * a bad migration. + * + * In order to maintain a consistent state between keys and slots + * we need to remove all the keys from the slots we lost. */ + delete_dirty_slots = 1; } } else if (dirty_slots_count) { /* If we are here, we received an update message which removed @@ -2686,6 +2693,10 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * In order to maintain a consistent state between keys and slots * we need to remove all the keys from the slots we lost. */ + delete_dirty_slots = 1; + } + + if (delete_dirty_slots) { for (int j = 0; j < dirty_slots_count; j++) { serverLog(LL_NOTICE, "Deleting keys in dirty slot %d on node %.40s (%s) in shard %.40s", dirty_slots[j], myself->name, myself->human_nodename, myself->shard_id); @@ -6069,7 +6080,7 @@ void removeChannelsInSlot(unsigned int slot) { /* Remove all the keys in the specified hash slot. 
* The number of removed items is returned. */ unsigned int delKeysInSlot(unsigned int hashslot) { - if (!kvstoreDictSize(server.db->keys, hashslot)) return 0; + if (!countKeysInSlot(hashslot)) return 0; unsigned int j = 0; diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index 05d6528684..d04069ef16 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -400,3 +400,23 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { test_cluster_setslot "setslot" } my_slot_allocation cluster_allocate_replicas ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { + test "Empty primary will check and delete the dirty slots" { + R 2 config set cluster-allow-replica-migration no + + # Write a key to slot 0. + R 2 incr key_977613 + + # Move slot 0 from primary 2 to primary 0. + R 0 cluster bumpepoch + R 0 cluster setslot 0 node [R 0 cluster myid] + + # Wait for R 2 to report that it is an empty primary (cluster-allow-replica-migration no) + wait_for_log_messages -2 {"*I am now an empty primary*"} 0 1000 50 + + # Make sure primary 0 will delete the dirty slots. + verify_log_message -2 "*Deleting keys in dirty slot 0*" 0 + assert_equal [R 2 dbsize] 0 + } +} my_slot_allocation cluster_allocate_replicas ;# start_cluster From 6fba747c39bee10e27942afabd2c46be4b4fae39 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 14 Nov 2024 10:26:23 +0800 Subject: [PATCH 26/92] Fix log printing always shows the role as child under daemonize (#1301) In #1282, we init server.pid earlier to keep log message role consistent, but we forgot to consider daemonize. In daemonize mode, we will always print the child role. 
We need to reset server.pid after daemonize(), otherwise the log printing role will always be the child. It also causes a incorrect server.pid value, affecting the concatenation of some pid names. Signed-off-by: Binbin --- src/server.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index 0c4ddbe4b8..be3982278f 100644 --- a/src/server.c +++ b/src/server.c @@ -7064,7 +7064,12 @@ __attribute__((weak)) int main(int argc, char **argv) { /* Daemonize if needed */ server.supervised = serverIsSupervised(server.supervised_mode); int background = server.daemonize && !server.supervised; - if (background) daemonize(); + if (background) { + /* We need to reset server.pid after daemonize(), otherwise the + * log printing role will always be the child. */ + daemonize(); + server.pid = getpid(); + } serverLog(LL_NOTICE, "oO0OoO0OoO0Oo Valkey is starting oO0OoO0OoO0Oo"); serverLog(LL_NOTICE, "Valkey version=%s, bits=%d, commit=%s, modified=%d, pid=%d, just started", VALKEY_VERSION, From 4a9864206f8aa1b3b33976c0a96b292d3fa4905a Mon Sep 17 00:00:00 2001 From: skyfirelee <739609084@qq.com> Date: Thu, 14 Nov 2024 10:37:44 +0800 Subject: [PATCH 27/92] Migrate quicklist unit test to new framework (#515) Migrate quicklist unit test to new unit test framework, and cleanup remaining references of SERVER_TEST, parent ticket #428. Closes #428. 
Signed-off-by: artikell <739609084@qq.com> Signed-off-by: Binbin Co-authored-by: Binbin --- .github/workflows/daily.yml | 49 +- src/Makefile | 3 - src/quicklist.c | 1420 --------------------- src/quicklist.h | 4 - src/server.c | 73 -- src/unit/test_files.h | 60 + src/unit/test_quicklist.c | 2300 +++++++++++++++++++++++++++++++++++ 7 files changed, 2377 insertions(+), 1532 deletions(-) create mode 100644 src/unit/test_quicklist.c diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index bcfa35c939..62eecb1fa8 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -60,7 +60,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make all-with-unit-tests SERVER_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -75,10 +75,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -109,7 +106,7 @@ jobs: run: | apt-get update && apt-get install -y make gcc-13 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 - make all-with-unit-tests CC=gcc OPT=-O3 SERVER_CFLAGS='-Werror -DSERVER_TEST -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' + make all-with-unit-tests CC=gcc OPT=-O3 SERVER_CFLAGS='-Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' - name: testprep run: apt-get install -y tcl8.6 tclx procps - name: test @@ -124,10 +121,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster 
${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -234,7 +228,7 @@ jobs: - name: make run: | sudo apt-get update && sudo apt-get install libc6-dev-i386 - make 32bit SERVER_CFLAGS='-Werror -DSERVER_TEST' + make 32bit SERVER_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -251,10 +245,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -483,7 +474,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make valgrind SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -515,7 +506,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make all-with-unit-tests valgrind SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -526,7 +517,7 @@ jobs: - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | - valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-server test all --valgrind + valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no 
--show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-unit-tests --valgrind if grep -q 0x err.txt; then cat err.txt; exit 1; fi test-valgrind-no-malloc-usable-size-test: @@ -552,7 +543,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DSERVER_TEST" SERVER_CFLAGS='-Werror' + run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -584,7 +575,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DSERVER_TEST" SERVER_CFLAGS='-Werror' + run: make all-with-unit-tests valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -595,7 +586,7 @@ jobs: - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | - valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-server test all --valgrind + valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-unit-tests --valgrind if grep -q 0x err.txt; then cat err.txt; exit 1; fi test-sanitizer-address: @@ -627,7 +618,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests OPT=-O3 SANITIZER=address SERVER_CFLAGS='-DSERVER_TEST -Werror' + run: make all-with-unit-tests OPT=-O3 SANITIZER=address SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -644,10 +635,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && 
!contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests @@ -680,7 +668,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests OPT=-O3 SANITIZER=undefined SERVER_CFLAGS='-DSERVER_TEST -Werror' LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations + run: make all-with-unit-tests OPT=-O3 SANITIZER=undefined SERVER_CFLAGS='-Werror' LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations - name: testprep run: | sudo apt-get update @@ -697,10 +685,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -1031,7 +1016,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make SERVER_CFLAGS='-Werror' test-freebsd: runs-on: macos-12 diff --git a/src/Makefile b/src/Makefile index ae2de1c626..21affe61a3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -131,9 +131,6 @@ ifdef REDIS_LDFLAGS endif FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS) -ifeq ($(SERVER_TEST),yes) - FINAL_CFLAGS +=-DSERVER_TEST=1 -endif FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm DEBUG=-g -ggdb diff --git a/src/quicklist.c b/src/quicklist.c index 617d21cd8c..225fac6fdf 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -210,9 +210,7 @@ void quicklistRelease(quicklist *quicklist) { * 
Returns 1 if listpack compressed successfully. * Returns 0 if compression failed or if listpack too small to compress. */ static int __quicklistCompressNode(quicklistNode *node) { -#ifdef SERVER_TEST node->attempted_compress = 1; -#endif if (node->dont_compress) return 0; /* validate that the node is neither @@ -250,9 +248,7 @@ static int __quicklistCompressNode(quicklistNode *node) { /* Uncompress the listpack in 'node' and update encoding details. * Returns 1 on successful decode, 0 on failure to decode. */ static int __quicklistDecompressNode(quicklistNode *node) { -#ifdef SERVER_TEST node->attempted_compress = 0; -#endif node->recompress = 0; void *decompressed = zmalloc(node->sz); @@ -1692,1419 +1688,3 @@ void quicklistBookmarksClear(quicklist *ql) { /* NOTE: We do not shrink (realloc) the quick list. main use case for this * function is just before releasing the allocation. */ } - -/* The rest of this file is test cases and test helpers. */ -#ifdef SERVER_TEST -#include -#include -#include "testhelp.h" -#include - -#define yell(str, ...) printf("ERROR! " str "\n\n", __VA_ARGS__) - -#define ERROR \ - do { \ - printf("\tERROR!\n"); \ - err++; \ - } while (0) - -#define ERR(x, ...) \ - do { \ - printf("%s:%s:%d:\t", __FILE__, __func__, __LINE__); \ - printf("ERROR! " x "\n", __VA_ARGS__); \ - err++; \ - } while (0) - -#define TEST(name) printf("test — %s\n", name); -#define TEST_DESC(name, ...) 
printf("test — " name "\n", __VA_ARGS__); - -#define QL_TEST_VERBOSE 0 - -#define UNUSED(x) (void)(x) -static void ql_info(quicklist *ql) { -#if QL_TEST_VERBOSE - printf("Container length: %lu\n", ql->len); - printf("Container size: %lu\n", ql->count); - if (ql->head) printf("\t(zsize head: %lu)\n", lpLength(ql->head->entry)); - if (ql->tail) printf("\t(zsize tail: %lu)\n", lpLength(ql->tail->entry)); - printf("\n"); -#else - UNUSED(ql); -#endif -} - -/* Return the UNIX time in microseconds */ -static long long ustime(void) { - struct timeval tv; - long long ust; - - gettimeofday(&tv, NULL); - ust = ((long long)tv.tv_sec) * 1000000; - ust += tv.tv_usec; - return ust; -} - -/* Return the UNIX time in milliseconds */ -static long long mstime(void) { - return ustime() / 1000; -} - -/* Iterate over an entire quicklist. - * Print the list if 'print' == 1. - * - * Returns physical count of elements found by iterating over the list. */ -static int _itrprintr(quicklist *ql, int print, int forward) { - quicklistIter *iter = quicklistGetIterator(ql, forward ? AL_START_HEAD : AL_START_TAIL); - quicklistEntry entry; - int i = 0; - int p = 0; - quicklistNode *prev = NULL; - while (quicklistNext(iter, &entry)) { - if (entry.node != prev) { - /* Count the number of list nodes too */ - p++; - prev = entry.node; - } - if (print) { - int size = (entry.sz > (1 << 20)) ? 
1 << 20 : entry.sz; - printf("[%3d (%2d)]: [%.*s] (%lld)\n", i, p, size, (char *)entry.value, entry.longval); - } - i++; - } - quicklistReleaseIterator(iter); - return i; -} -static int itrprintr(quicklist *ql, int print) { - return _itrprintr(ql, print, 1); -} - -static int itrprintr_rev(quicklist *ql, int print) { - return _itrprintr(ql, print, 0); -} - -#define ql_verify(a, b, c, d, e) \ - do { \ - err += _ql_verify((a), (b), (c), (d), (e)); \ - } while (0) - -static int _ql_verify_compress(quicklist *ql) { - int errors = 0; - if (quicklistAllowsCompression(ql)) { - quicklistNode *node = ql->head; - unsigned int low_raw = ql->compress; - unsigned int high_raw = ql->len - ql->compress; - - for (unsigned int at = 0; at < ql->len; at++, node = node->next) { - if (node && (at < low_raw || at >= high_raw)) { - if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { - yell("Incorrect compression: node %d is " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; recompress: %d)", - at, ql->compress, low_raw, high_raw, ql->len, node->sz, node->recompress); - errors++; - } - } else { - if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && !node->attempted_compress) { - yell("Incorrect non-compression: node %d is NOT " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; recompress: %d; attempted: %d)", - at, ql->compress, low_raw, high_raw, ql->len, node->sz, node->recompress, - node->attempted_compress); - errors++; - } - } - } - } - return errors; -} - -/* Verify list metadata matches physical list contents. 
*/ -static int _ql_verify(quicklist *ql, uint32_t len, uint32_t count, uint32_t head_count, uint32_t tail_count) { - int errors = 0; - - ql_info(ql); - if (len != ql->len) { - yell("quicklist length wrong: expected %d, got %lu", len, ql->len); - errors++; - } - - if (count != ql->count) { - yell("quicklist count wrong: expected %d, got %lu", count, ql->count); - errors++; - } - - int loopr = itrprintr(ql, 0); - if (loopr != (int)ql->count) { - yell("quicklist cached count not match actual count: expected %lu, got " - "%d", - ql->count, loopr); - errors++; - } - - int rloopr = itrprintr_rev(ql, 0); - if (loopr != rloopr) { - yell("quicklist has different forward count than reverse count! " - "Forward count is %d, reverse count is %d.", - loopr, rloopr); - errors++; - } - - if (ql->len == 0 && !errors) { - return errors; - } - - if (ql->head && head_count != ql->head->count && head_count != lpLength(ql->head->entry)) { - yell("quicklist head count wrong: expected %d, " - "got cached %d vs. actual %lu", - head_count, ql->head->count, lpLength(ql->head->entry)); - errors++; - } - - if (ql->tail && tail_count != ql->tail->count && tail_count != lpLength(ql->tail->entry)) { - yell("quicklist tail count wrong: expected %d, " - "got cached %u vs. actual %lu", - tail_count, ql->tail->count, lpLength(ql->tail->entry)); - errors++; - } - - errors += _ql_verify_compress(ql); - return errors; -} - -/* Release iterator and verify compress correctly. 
*/ -static void ql_release_iterator(quicklistIter *iter) { - quicklist *ql = NULL; - if (iter) ql = iter->quicklist; - quicklistReleaseIterator(iter); - if (ql) assert(!_ql_verify_compress(ql)); -} - -/* Generate new string concatenating integer i against string 'prefix' */ -static char *genstr(char *prefix, int i) { - static char result[64] = {0}; - snprintf(result, sizeof(result), "%s%d", prefix, i); - return result; -} - -static void randstring(unsigned char *target, size_t sz) { - size_t p = 0; - int minval, maxval; - switch (rand() % 3) { - case 0: - minval = 'a'; - maxval = 'z'; - break; - case 1: - minval = '0'; - maxval = '9'; - break; - case 2: - minval = 'A'; - maxval = 'Z'; - break; - default: assert(NULL); - } - - while (p < sz) target[p++] = minval + rand() % (maxval - minval + 1); -} - -/* main test, but callable from other files */ -int quicklistTest(int argc, char *argv[], int flags) { - UNUSED(argc); - UNUSED(argv); - - int accurate = (flags & TEST_ACCURATE); - unsigned int err = 0; - int optimize_start = -(int)(sizeof(optimization_level) / sizeof(*optimization_level)); - - printf("Starting optimization offset at: %d\n", optimize_start); - - int options[] = {0, 1, 2, 3, 4, 5, 6, 10}; - int fills[] = {-5, -4, -3, -2, -1, 0, 1, 2, 32, 66, 128, 999}; - size_t option_count = sizeof(options) / sizeof(*options); - int fill_count = (int)(sizeof(fills) / sizeof(*fills)); - long long runtime[option_count]; - - for (int _i = 0; _i < (int)option_count; _i++) { - printf("Testing Compression option %d\n", options[_i]); - long long start = mstime(); - quicklistIter *iter; - - TEST("create list") { - quicklist *ql = quicklistNew(-2, options[_i]); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("add to tail of empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "hello", 6); - /* 1 for head and 1 for tail because 1 node = head = tail */ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - - TEST("add to 
head of empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - /* 1 for head and 1 for tail because 1 node = head = tail */ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - - TEST_DESC("add to tail 5x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 5; i++) quicklistPushTail(ql, genstr("hello", i), 32); - if (ql->count != 5) ERROR; - if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); - quicklistRelease(ql); - } - } - - TEST_DESC("add to head 5x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 5; i++) quicklistPushHead(ql, genstr("hello", i), 32); - if (ql->count != 5) ERROR; - if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); - quicklistRelease(ql); - } - } - - TEST_DESC("add to tail 500x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 64); - if (ql->count != 500) ERROR; - if (fills[f] == 32) ql_verify(ql, 16, 500, 32, 20); - quicklistRelease(ql); - } - } - - TEST_DESC("add to head 500x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - if (ql->count != 500) ERROR; - if (fills[f] == 32) ql_verify(ql, 16, 500, 20, 32); - quicklistRelease(ql); - } - } - - TEST("rotate empty") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistRotate(ql); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("Comprassion Plain node") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? 
quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - - char buf[large_limit]; - quicklist *ql = quicklistNew(fills[f], 1); - for (int i = 0; i < 500; i++) { - /* Set to 256 to allow the node to be triggered to compress, - * if it is less than 48(nocompress), the test will be successful. */ - snprintf(buf, sizeof(buf), "hello%d", i); - quicklistPushHead(ql, buf, large_limit); - } - - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - assert(QL_NODE_IS_PLAIN(entry.node)); - snprintf(buf, sizeof(buf), "hello%d", i); - if (strcmp((char *)entry.value, buf)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, buf, i); - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("NEXT plain node") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - quicklist *ql = quicklistNew(fills[f], options[_i]); - - char buf[large_limit]; - memcpy(buf, "plain", 5); - quicklistPushHead(ql, buf, large_limit); - quicklistPushHead(ql, buf, large_limit); - quicklistPushHead(ql, "packed3", 7); - quicklistPushHead(ql, "packed4", 7); - quicklistPushHead(ql, buf, large_limit); - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - - while (quicklistNext(iter, &entry) != 0) { - if (QL_NODE_IS_PLAIN(entry.node)) - assert(!memcmp(entry.value, "plain", 5)); - else - assert(!memcmp(entry.value, "packed", 6)); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("rotate plain node ") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? 
quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - - unsigned char *data = NULL; - size_t sz; - long long lv; - int i = 0; - quicklist *ql = quicklistNew(fills[f], options[_i]); - char buf[large_limit]; - memcpy(buf, "hello1", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello4", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello3", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello2", 6); - quicklistPushHead(ql, buf, large_limit); - quicklistRotate(ql); - - for (i = 1; i < 5; i++) { - assert(QL_NODE_IS_PLAIN(ql->tail)); - quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - int temp_char = data[5]; - zfree(data); - assert(temp_char == ('0' + i)); - } - - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - } - - TEST("rotate one val once") { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistRotate(ql); - /* Ignore compression verify because listpack is - * too small to compress. 
*/ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - } - - TEST_DESC("rotate 500 val 5000 times at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushHead(ql, "900", 3); - quicklistPushHead(ql, "7000", 4); - quicklistPushHead(ql, "-1200", 5); - quicklistPushHead(ql, "42", 2); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 64); - ql_info(ql); - for (int i = 0; i < 5000; i++) { - ql_info(ql); - quicklistRotate(ql); - } - if (fills[f] == 1) - ql_verify(ql, 504, 504, 1, 1); - else if (fills[f] == 2) - ql_verify(ql, 252, 504, 2, 2); - else if (fills[f] == 32) - ql_verify(ql, 16, 504, 32, 24); - quicklistRelease(ql); - } - } - - TEST("pop empty") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPop(ql, QUICKLIST_HEAD, NULL, NULL, NULL); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop 1 string from 1") { - quicklist *ql = quicklistNew(-2, options[_i]); - char *populate = genstr("hello", 331); - quicklistPushHead(ql, populate, 32); - unsigned char *data; - size_t sz; - long long lv; - ql_info(ql); - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); - assert(data != NULL); - assert(sz == 32); - if (strcmp(populate, (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value (%s)", size, data, populate); - } - zfree(data); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 1 number from 1") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "55513", 5); - unsigned char *data; - size_t sz; - long long lv; - ql_info(ql); - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); - assert(data == NULL); - assert(lv == 55513); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 500 from 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", 
i), 32); - ql_info(ql); - for (int i = 0; i < 500; i++) { - unsigned char *data; - size_t sz; - long long lv; - int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - assert(ret == 1); - assert(data != NULL); - assert(sz == 32); - if (strcmp(genstr("hello", 499 - i), (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value (%s)", size, data, genstr("hello", 499 - i)); - } - zfree(data); - } - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 5000 from 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - for (int i = 0; i < 5000; i++) { - unsigned char *data; - size_t sz; - long long lv; - int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - if (i < 500) { - assert(ret == 1); - assert(data != NULL); - assert(sz == 32); - if (strcmp(genstr("hello", 499 - i), (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value " - "(%s)", - size, data, genstr("hello", 499 - i)); - } - zfree(data); - } else { - assert(ret == 0); - } - } - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("iterate forward over 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - int i = 499, count = 0; - while (quicklistNext(iter, &entry)) { - char *h = genstr("hello", i); - if (strcmp((char *)entry.value, h)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, h, i); - i--; - count++; - } - if (count != 500) ERR("Didn't iterate over exactly 500 elements (%d)", i); - ql_verify(ql, 16, 500, 20, 32); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("iterate reverse over 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for 
(int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - char *h = genstr("hello", i); - if (strcmp((char *)entry.value, h)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, h, i); - i++; - } - if (i != 500) ERR("Didn't iterate over exactly 500 elements (%d)", i); - ql_verify(ql, 16, 500, 20, 32); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert after 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - quicklistInsertAfter(iter, &entry, "abc", 4); - ql_release_iterator(iter); - ql_verify(ql, 1, 2, 2, 2); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "hello", 5)) { - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - sz = entry.sz; - if (strncmp((char *)entry.value, "abc", 3)) { - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert before 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - quicklistInsertBefore(iter, &entry, "abc", 4); - ql_release_iterator(iter); - ql_verify(ql, 1, 2, 2, 2); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "abc", 3)) { - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - sz = entry.sz; - if 
(strncmp((char *)entry.value, "hello", 5)) { - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert head while head node is full") { - quicklist *ql = quicklistNew(4, options[_i]); - for (int i = 0; i < 10; i++) quicklistPushTail(ql, genstr("hello", i), 6); - quicklistSetFill(ql, -1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -10, &entry); - char buf[4096] = {0}; - quicklistInsertBefore(iter, &entry, buf, 4096); - ql_release_iterator(iter); - ql_verify(ql, 4, 11, 1, 2); - quicklistRelease(ql); - } - - TEST("insert tail while tail node is full") { - quicklist *ql = quicklistNew(4, options[_i]); - for (int i = 0; i < 10; i++) quicklistPushHead(ql, genstr("hello", i), 6); - quicklistSetFill(ql, -1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - char buf[4096] = {0}; - quicklistInsertAfter(iter, &entry, buf, 4096); - ql_release_iterator(iter); - ql_verify(ql, 4, 11, 2, 1); - quicklistRelease(ql); - } - - TEST_DESC("insert once in elements while iterating at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushTail(ql, "abc", 3); - quicklistSetFill(ql, 1); - quicklistPushTail(ql, "def", 3); /* force to unique node */ - quicklistSetFill(ql, f); - quicklistPushTail(ql, "bob", 3); /* force to reset for +3 */ - quicklistPushTail(ql, "foo", 3); - quicklistPushTail(ql, "zoo", 3); - - itrprintr(ql, 0); - /* insert "bar" before "bob" while iterating over list. */ - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - while (quicklistNext(iter, &entry)) { - if (!strncmp((char *)entry.value, "bob", 3)) { - /* Insert as fill = 1 so it spills into new node. */ - quicklistInsertBefore(iter, &entry, "bar", 3); - break; /* didn't we fix insert-while-iterating? 
*/ - } - } - ql_release_iterator(iter); - itrprintr(ql, 0); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - - if (strncmp((char *)entry.value, "abc", 3)) - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (strncmp((char *)entry.value, "def", 3)) - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); - if (strncmp((char *)entry.value, "bar", 3)) - ERR("Value 2 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); - if (strncmp((char *)entry.value, "bob", 3)) - ERR("Value 3 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); - if (strncmp((char *)entry.value, "foo", 3)) - ERR("Value 4 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 5, &entry); - if (strncmp((char *)entry.value, "zoo", 3)) - ERR("Value 5 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("insert [before] 250 new in middle of 500 elements at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 32); - for (int i = 0; i < 250; i++) { - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); - quicklistInsertBefore(iter, &entry, genstr("abc", i), 32); - ql_release_iterator(iter); - } - if (fills[f] == 32) ql_verify(ql, 25, 750, 32, 20); - quicklistRelease(ql); - } - } - - TEST_DESC("insert [after] 250 new in middle of 500 elements at compress %d", options[_i]) 
{ - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - for (int i = 0; i < 250; i++) { - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); - quicklistInsertAfter(iter, &entry, genstr("abc", i), 32); - ql_release_iterator(iter); - } - - if (ql->count != 750) ERR("List size not 750, but rather %ld", ql->count); - - if (fills[f] == 32) ql_verify(ql, 26, 750, 20, 32); - quicklistRelease(ql); - } - } - - TEST("duplicate empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - ql_verify(ql, 0, 0, 0, 0); - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 0, 0, 0, 0); - quicklistRelease(ql); - quicklistRelease(copy); - } - - TEST("duplicate list of 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, genstr("hello", 3), 32); - ql_verify(ql, 1, 1, 1, 1); - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 1, 1, 1, 1); - quicklistRelease(ql); - quicklistRelease(copy); - } - - TEST("duplicate list of 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - ql_verify(ql, 16, 500, 20, 32); - - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 16, 500, 20, 32); - quicklistRelease(ql); - quicklistRelease(copy); - } - - for (int f = 0; f < fill_count; f++) { - TEST_DESC("index 1,200 from 500 list at fill %d at compress %d", f, options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (strcmp((char *)entry.value, "hello2") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 200, &entry); - if (strcmp((char *)entry.value, "hello201") 
!= 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index -1,-2 from 500 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (strcmp((char *)entry.value, "hello500") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); - if (strcmp((char *)entry.value, "hello499") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index -100 from 500 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -100, &entry); - if (strcmp((char *)entry.value, "hello401") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index too big +1 from 50 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - int sz = entry.sz; - iter = quicklistGetIteratorEntryAtIdx(ql, 50, &entry); - if (iter) ERR("Index found at 50 with 50 list: %.*s", sz, entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("delete range empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistDelRange(ql, 5, 20); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete range of entire node in list of one node") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", 
i), 32); - ql_verify(ql, 1, 32, 32, 32); - quicklistDelRange(ql, 0, 32); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete range of entire node with overflow counts") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); - ql_verify(ql, 1, 32, 32, 32); - quicklistDelRange(ql, 0, 128); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete middle 100 of 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, 200, 100); - ql_verify(ql, 14, 400, 32, 20); - quicklistRelease(ql); - } - - TEST("delete less than fill but across nodes") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, 60, 10); - ql_verify(ql, 16, 490, 32, 20); - quicklistRelease(ql); - } - - TEST("delete negative 1 from 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, -1, 1); - ql_verify(ql, 16, 499, 32, 19); - quicklistRelease(ql); - } - - TEST("delete negative 1 from 500 list with overflow counts") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, -1, 128); - ql_verify(ql, 16, 499, 32, 19); - quicklistRelease(ql); - } - - TEST("delete negative 100 from 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, 
genstr("hello", i + 1), 32); - quicklistDelRange(ql, -100, 100); - ql_verify(ql, 13, 400, 32, 16); - quicklistRelease(ql); - } - - TEST("delete -10 count 5 from 50 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 2, 50, 32, 18); - quicklistDelRange(ql, -10, 5); - ql_verify(ql, 2, 45, 32, 13); - quicklistRelease(ql); - } - - TEST("numbers only list read") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "1111", 4); - quicklistPushTail(ql, "2222", 4); - quicklistPushTail(ql, "3333", 4); - quicklistPushTail(ql, "4444", 4); - ql_verify(ql, 1, 4, 4, 4); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != 1111) ERR("Not 1111, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (entry.longval != 2222) ERR("Not 2222, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); - if (entry.longval != 3333) ERR("Not 3333, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); - if (entry.longval != 4444) ERR("Not 4444, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); - if (iter) ERR("Index past elements: %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (entry.longval != 4444) ERR("Not 4444 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); - if (entry.longval != 3333) ERR("Not 3333 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -3, &entry); - if (entry.longval != 2222) ERR("Not 2222 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - 
- iter = quicklistGetIteratorEntryAtIdx(ql, -4, &entry); - if (entry.longval != 1111) ERR("Not 1111 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -5, &entry); - if (iter) ERR("Index past elements (reverse), %lld", entry.longval); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("numbers larger list read") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 5000; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); - quicklistEntry entry; - for (int i = 0; i < 5000; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[i]) ERR("[%d] Not longval %lld but rather %lld", i, nums[i], entry.longval); - entry.longval = 0xdeadbeef; - ql_release_iterator(iter); - } - iter = quicklistGetIteratorEntryAtIdx(ql, 5000, &entry); - if (strncmp((char *)entry.value, "xxxxxxxxxxxxxxxxxxxx", 20)) ERR("String val not match: %s", entry.value); - ql_verify(ql, 157, 5001, 32, 9); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("numbers larger list read B") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "99", 2); - quicklistPushTail(ql, "98", 2); - quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); - quicklistPushTail(ql, "96", 2); - quicklistPushTail(ql, "95", 2); - quicklistReplaceAtIndex(ql, 1, "foo", 3); - quicklistReplaceAtIndex(ql, -1, "bar", 3); - quicklistRelease(ql); - } - - TEST_DESC("lrem test at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char *words[] = {"abc", "foo", "bar", "foobar", "foobared", "zap", "bar", "test", "foo"}; - char *result[] = {"abc", "foo", "foobar", "foobared", "zap", "test", "foo"}; - char 
*resultB[] = {"abc", "foo", "foobar", "foobared", "zap", "test"}; - for (int i = 0; i < 9; i++) quicklistPushTail(ql, words[i], strlen(words[i])); - - /* lrem 0 bar */ - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"bar", 3)) { - quicklistDelEntry(iter, &entry); - } - i++; - } - ql_release_iterator(iter); - - /* check result of lrem 0 bar */ - iter = quicklistGetIterator(ql, AL_START_HEAD); - i = 0; - while (quicklistNext(iter, &entry)) { - /* Result must be: abc, foo, foobar, foobared, zap, test, - * foo */ - int sz = entry.sz; - if (strncmp((char *)entry.value, result[i], entry.sz)) { - ERR("No match at position %d, got %.*s instead of %s", i, sz, entry.value, result[i]); - } - i++; - } - ql_release_iterator(iter); - - quicklistPushTail(ql, "foo", 3); - - /* lrem -2 foo */ - iter = quicklistGetIterator(ql, AL_START_TAIL); - i = 0; - int del = 2; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"foo", 3)) { - quicklistDelEntry(iter, &entry); - del--; - } - if (!del) break; - i++; - } - ql_release_iterator(iter); - - /* check result of lrem -2 foo */ - /* (we're ignoring the '2' part and still deleting all foo - * because - * we only have two foo) */ - iter = quicklistGetIterator(ql, AL_START_TAIL); - i = 0; - size_t resB = sizeof(resultB) / sizeof(*resultB); - while (quicklistNext(iter, &entry)) { - /* Result must be: abc, foo, foobar, foobared, zap, test, - * foo */ - int sz = entry.sz; - if (strncmp((char *)entry.value, resultB[resB - 1 - i], sz)) { - ERR("No match at position %d, got %.*s instead of %s", i, sz, entry.value, - resultB[resB - 1 - i]); - } - i++; - } - - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("iterate reverse + delete at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], 
options[_i]); - quicklistPushTail(ql, "abc", 3); - quicklistPushTail(ql, "def", 3); - quicklistPushTail(ql, "hij", 3); - quicklistPushTail(ql, "jkl", 3); - quicklistPushTail(ql, "oop", 3); - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - int i = 0; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"hij", 3)) { - quicklistDelEntry(iter, &entry); - } - i++; - } - ql_release_iterator(iter); - - if (i != 5) ERR("Didn't iterate 5 times, iterated %d times.", i); - - /* Check results after deletion of "hij" */ - iter = quicklistGetIterator(ql, AL_START_HEAD); - i = 0; - char *vals[] = {"abc", "def", "jkl", "oop"}; - while (quicklistNext(iter, &entry)) { - if (!quicklistCompare(&entry, (unsigned char *)vals[i], 3)) { - ERR("Value at %d didn't match %s\n", i, vals[i]); - } - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("iterator at index test at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 760; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIteratorAtIdx(ql, AL_START_HEAD, 437); - int i = 437; - while (quicklistNext(iter, &entry)) { - if (entry.longval != nums[i]) ERR("Expected %lld, but got %lld", entry.longval, nums[i]); - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test A at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 32; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) 
ql_verify(ql, 1, 32, 32, 32); - /* ltrim 25 53 (keep [25,32] inclusive = 7 remaining) */ - quicklistDelRange(ql, 0, 25); - quicklistDelRange(ql, 0, 0); - quicklistEntry entry; - for (int i = 0; i < 7; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[25 + i]) - ERR("Deleted invalid range! Expected %lld but got " - "%lld", - entry.longval, nums[25 + i]); - ql_release_iterator(iter); - } - if (fills[f] == 32) ql_verify(ql, 1, 7, 7, 7); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test B at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - /* Force-disable compression because our 33 sequential - * integers don't compress and the check always fails. */ - quicklist *ql = quicklistNew(fills[f], QUICKLIST_NOCOMPRESS); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */ - quicklistDelRange(ql, 0, 5); - quicklistDelRange(ql, -16, 16); - if (fills[f] == 32) ql_verify(ql, 1, 12, 12, 12); - quicklistEntry entry; - - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != 5) ERR("A: longval not 5, but %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (entry.longval != 16) ERR("B! got instead: %lld", entry.longval); - quicklistPushTail(ql, "bobobob", 7); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "bobobob", 7)) - ERR("Tail doesn't match bobobob, it's %.*s instead", sz, entry.value); - ql_release_iterator(iter); - - for (int i = 0; i < 12; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[5 + i]) - ERR("Deleted invalid range! 
Expected %lld but got " - "%lld", - entry.longval, nums[5 + i]); - ql_release_iterator(iter); - } - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test C at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */ - quicklistDelRange(ql, 0, 3); - quicklistDelRange(ql, -29, 4000); /* make sure not loop forever */ - if (fills[f] == 32) ql_verify(ql, 1, 1, 1, 1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != -5157318210846258173) ERROR; - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test D at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - quicklistDelRange(ql, -12, 3); - if (ql->count != 30) ERR("Didn't delete exactly three elements! Count is: %lu", ql->count); - quicklistRelease(ql); - } - } - - long long stop = mstime(); - runtime[_i] = stop - start; - } - - /* Run a longer test of compression depth outside of primary test loop. */ - int list_sizes[] = {250, 251, 500, 999, 1000}; - long long start = mstime(); - int list_count = accurate ? 
(int)(sizeof(list_sizes) / sizeof(*list_sizes)) : 1; - for (int list = 0; list < list_count; list++) { - TEST_DESC("verify specific compression of interior nodes with %d list ", list_sizes[list]) { - for (int f = 0; f < fill_count; f++) { - for (int depth = 1; depth < 40; depth++) { - /* skip over many redundant test cases */ - quicklist *ql = quicklistNew(fills[f], depth); - for (int i = 0; i < list_sizes[list]; i++) { - quicklistPushTail(ql, genstr("hello TAIL", i + 1), 64); - quicklistPushHead(ql, genstr("hello HEAD", i + 1), 64); - } - - for (int step = 0; step < 2; step++) { - /* test remove node */ - if (step == 1) { - for (int i = 0; i < list_sizes[list] / 2; i++) { - unsigned char *data; - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, NULL, NULL)); - zfree(data); - assert(quicklistPop(ql, QUICKLIST_TAIL, &data, NULL, NULL)); - zfree(data); - } - } - quicklistNode *node = ql->head; - unsigned int low_raw = ql->compress; - unsigned int high_raw = ql->len - ql->compress; - - for (unsigned int at = 0; at < ql->len; at++, node = node->next) { - if (at < low_raw || at >= high_raw) { - if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { - ERR("Incorrect compression: node %d is " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu)", - at, depth, low_raw, high_raw, ql->len, node->sz); - } - } else { - if (node->encoding != QUICKLIST_NODE_ENCODING_LZF) { - ERR("Incorrect non-compression: node %d is NOT " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; attempted: %d)", - at, depth, low_raw, high_raw, ql->len, node->sz, node->attempted_compress); - } - } - } - } - - quicklistRelease(ql); - } - } - } - } - long long stop = mstime(); - - printf("\n"); - for (size_t i = 0; i < option_count; i++) - printf("Test Loop %02d: %0.2f seconds.\n", options[i], (float)runtime[i] / 1000); - printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000); - printf("\n"); - - TEST("bookmark get updated to next item") { - quicklist *ql 
= quicklistNew(1, 0); - quicklistPushTail(ql, "1", 1); - quicklistPushTail(ql, "2", 1); - quicklistPushTail(ql, "3", 1); - quicklistPushTail(ql, "4", 1); - quicklistPushTail(ql, "5", 1); - assert(ql->len == 5); - /* add two bookmarks, one pointing to the node before the last. */ - assert(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next)); - assert(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev)); - /* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */ - assert(quicklistBookmarkFind(ql, "_test") == ql->tail->prev); - assert(quicklistDelRange(ql, -2, 1)); - assert(quicklistBookmarkFind(ql, "_test") == ql->tail); - /* delete the last node, and see that the bookmark was deleted. */ - assert(quicklistDelRange(ql, -1, 1)); - assert(quicklistBookmarkFind(ql, "_test") == NULL); - /* test that other bookmarks aren't affected */ - assert(quicklistBookmarkFind(ql, "_dummy") == ql->head->next); - assert(quicklistBookmarkFind(ql, "_missing") == NULL); - assert(ql->len == 3); - quicklistBookmarksClear(ql); /* for coverage */ - assert(quicklistBookmarkFind(ql, "_dummy") == NULL); - quicklistRelease(ql); - } - - TEST("bookmark limit") { - int i; - quicklist *ql = quicklistNew(1, 0); - quicklistPushHead(ql, "1", 1); - for (i = 0; i < QL_MAX_BM; i++) assert(quicklistBookmarkCreate(&ql, genstr("", i), ql->head)); - /* when all bookmarks are used, creation fails */ - assert(!quicklistBookmarkCreate(&ql, "_test", ql->head)); - /* delete one and see that we can now create another */ - assert(quicklistBookmarkDelete(ql, "0")); - assert(quicklistBookmarkCreate(&ql, "_test", ql->head)); - /* delete one and see that the rest survive */ - assert(quicklistBookmarkDelete(ql, "_test")); - for (i = 1; i < QL_MAX_BM; i++) assert(quicklistBookmarkFind(ql, genstr("", i)) == ql->head); - /* make sure the deleted ones are indeed gone */ - assert(!quicklistBookmarkFind(ql, "0")); - assert(!quicklistBookmarkFind(ql, "_test")); - 
quicklistRelease(ql); - } - - if (flags & TEST_LARGE_MEMORY) { - TEST("compress and decompress quicklist listpack node") { - quicklistNode *node = quicklistCreateNode(); - node->entry = lpNew(0); - - /* Just to avoid triggering the assertion in __quicklistCompressNode(), - * it disables the passing of quicklist head or tail node. */ - node->prev = quicklistCreateNode(); - node->next = quicklistCreateNode(); - - /* Create a rand string */ - size_t sz = (1 << 25); /* 32MB per one entry */ - unsigned char *s = zmalloc(sz); - randstring(s, sz); - - /* Keep filling the node, until it reaches 1GB */ - for (int i = 0; i < 32; i++) { - node->entry = lpAppend(node->entry, s, sz); - quicklistNodeUpdateSz(node); - - long long start = mstime(); - assert(__quicklistCompressNode(node)); - assert(__quicklistDecompressNode(node)); - printf("Compress and decompress: %zu MB in %.2f seconds.\n", node->sz / 1024 / 1024, - (float)(mstime() - start) / 1000); - } - - zfree(s); - zfree(node->prev); - zfree(node->next); - zfree(node->entry); - zfree(node); - } - -#if ULONG_MAX >= 0xffffffffffffffff - TEST("compress and decomress quicklist plain node large than UINT32_MAX") { - size_t sz = (1ull << 32); - unsigned char *s = zmalloc(sz); - randstring(s, sz); - memcpy(s, "helloworld", 10); - memcpy(s + sz - 10, "1234567890", 10); - - quicklistNode *node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, s, sz); - - /* Just to avoid triggering the assertion in __quicklistCompressNode(), - * it disables the passing of quicklist head or tail node. 
*/ - node->prev = quicklistCreateNode(); - node->next = quicklistCreateNode(); - - long long start = mstime(); - assert(__quicklistCompressNode(node)); - assert(__quicklistDecompressNode(node)); - printf("Compress and decompress: %zu MB in %.2f seconds.\n", node->sz / 1024 / 1024, - (float)(mstime() - start) / 1000); - - assert(memcmp(node->entry, "helloworld", 10) == 0); - assert(memcmp(node->entry + sz - 10, "1234567890", 10) == 0); - zfree(node->prev); - zfree(node->next); - zfree(node->entry); - zfree(node); - } -#endif - } - - if (!err) - printf("ALL TESTS PASSED!\n"); - else - ERR("Sorry, not all tests passed! In fact, %d tests failed.", err); - - return err; -} -#endif diff --git a/src/quicklist.h b/src/quicklist.h index bb94807913..4411f823b0 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -198,10 +198,6 @@ quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name); void quicklistBookmarksClear(quicklist *ql); int quicklistSetPackedThreshold(size_t sz); -#ifdef SERVER_TEST -int quicklistTest(int argc, char *argv[], int flags); -#endif - /* Directions for iterators */ #define AL_START_HEAD 0 #define AL_START_TAIL 1 diff --git a/src/server.c b/src/server.c index be3982278f..3217351faf 100644 --- a/src/server.c +++ b/src/server.c @@ -6774,85 +6774,12 @@ int iAmPrimary(void) { (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } -#ifdef SERVER_TEST -#include "testhelp.h" -#include "intset.h" /* Compact integer set structure */ - -int __failed_tests = 0; -int __test_num = 0; - -/* The flags are the following: - * --accurate: Runs tests with more iterations. - * --large-memory: Enables tests that consume more than 100mb. 
*/ -typedef int serverTestProc(int argc, char **argv, int flags); -struct serverTest { - char *name; - serverTestProc *proc; - int failed; -} serverTests[] = { - {"quicklist", quicklistTest}, -}; -serverTestProc *getTestProcByName(const char *name) { - int numtests = sizeof(serverTests) / sizeof(struct serverTest); - for (int j = 0; j < numtests; j++) { - if (!strcasecmp(name, serverTests[j].name)) { - return serverTests[j].proc; - } - } - return NULL; -} -#endif - /* Main is marked as weak so that unit tests can use their own main function. */ __attribute__((weak)) int main(int argc, char **argv) { struct timeval tv; int j; char config_from_stdin = 0; -#ifdef SERVER_TEST - monotonicInit(); /* Required for dict tests, that are relying on monotime during dict rehashing. */ - if (argc >= 3 && !strcasecmp(argv[1], "test")) { - int flags = 0; - for (j = 3; j < argc; j++) { - char *arg = argv[j]; - if (!strcasecmp(arg, "--accurate")) - flags |= TEST_ACCURATE; - else if (!strcasecmp(arg, "--large-memory")) - flags |= TEST_LARGE_MEMORY; - else if (!strcasecmp(arg, "--valgrind")) - flags |= TEST_VALGRIND; - } - - if (!strcasecmp(argv[2], "all")) { - int numtests = sizeof(serverTests) / sizeof(struct serverTest); - for (j = 0; j < numtests; j++) { - serverTests[j].failed = (serverTests[j].proc(argc, argv, flags) != 0); - } - - /* Report tests result */ - int failed_num = 0; - for (j = 0; j < numtests; j++) { - if (serverTests[j].failed) { - failed_num++; - printf("[failed] Test - %s\n", serverTests[j].name); - } else { - printf("[ok] Test - %s\n", serverTests[j].name); - } - } - - printf("%d tests, %d passed, %d failed\n", numtests, numtests - failed_num, failed_num); - - return failed_num == 0 ? 0 : 1; - } else { - serverTestProc *proc = getTestProcByName(argv[2]); - if (!proc) return -1; /* test not found */ - return proc(argc, argv, flags); - } - - return 0; - } -#endif - /* We need to initialize our libraries, and the server configuration. 
*/ #ifdef INIT_SETPROCTITLE_REPLACEMENT spt_init(argc, argv); diff --git a/src/unit/test_files.h b/src/unit/test_files.h index c2b062039a..87bc031fb4 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -84,6 +84,64 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_quicklistCreateList(int argc, char **argv, int flags); +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead500xAtCompress(int argc, char **argv, int flags); +int test_quicklistRotateEmpty(int argc, char **argv, int flags); +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags); +int test_quicklistNextPlainNode(int argc, char **argv, int flags); +int test_quicklistRotatePlainNode(int argc, char **argv, int flags); +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags); +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags); +int test_quicklistPopEmpty(int argc, char **argv, int flags); +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead500From500(int argc, char **argv, int flags); +int test_quicklistPopHead5000From500(int argc, char **argv, int flags); +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags); +int test_quicklistIterateReverseOver500List(int argc, char **argv, 
int flags); +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags); +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags); +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags); +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex12From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags); +int test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags); +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags); +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags); +int test_quicklistNumbersOnlyListRead(int 
argc, char **argv, int flags); +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags); +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags); +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags); +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags); +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags); +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags); +int test_quicklistBookmarkLimit(int argc, char **argv, int flags); +int test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags); +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags); int test_raxRandomWalk(int argc, char **argv, int flags); int test_raxIteratorUnitTests(int argc, char **argv, int flags); int test_raxTryInsertUnitTests(int argc, char **argv, int flags); @@ -157,6 +215,7 @@ unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, N unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", 
test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, 
{"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, 
{"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, 
{"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", 
test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, 
{"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; @@ -176,6 +235,7 @@ struct unitTestSuite { {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, {"test_sha1.c", __test_sha1_c}, diff --git a/src/unit/test_quicklist.c b/src/unit/test_quicklist.c new file mode 100644 index 0000000000..6addb33f41 --- /dev/null +++ b/src/unit/test_quicklist.c @@ -0,0 +1,2300 @@ +#include +#include +#include +#include "test_help.h" +#include +#include + +#include "../zmalloc.h" +#include "../listpack.h" +#include "../quicklist.c" + +static int options[] = {0, 1, 2, 3, 4, 5, 6, 10}; +static int option_count = 8; + +static int fills[] = {-5, -4, -3, -2, -1, 0, + 1, 2, 32, 66, 128, 999}; +static int fill_count = 12; +static long long runtime[8]; +static unsigned int err = 0; + +/*----------------------------------------------------------------------------- + * Unit Function + *----------------------------------------------------------------------------*/ +/* Return the UNIX time in microseconds */ +static long long ustime(void) { + struct timeval tv; + long long ust; + + gettimeofday(&tv, NULL); + ust = ((long long)tv.tv_sec) * 1000000; + ust += tv.tv_usec; + return ust; +} + +/* Return the UNIX time in milliseconds */ +static long long mstime(void) { + return ustime() / 1000; +} + +/* Generate new string concatenating integer i against string 'prefix' */ +static char 
*genstr(char *prefix, int i) { + static char result[64] = {0}; + snprintf(result, sizeof(result), "%s%d", prefix, i); + return result; +} + +__attribute__((unused)) static void randstring(unsigned char *target, size_t sz) { + size_t p = 0; + int minval, maxval; + switch (rand() % 3) { + case 0: + minval = 'a'; + maxval = 'z'; + break; + case 1: + minval = '0'; + maxval = '9'; + break; + case 2: + minval = 'A'; + maxval = 'Z'; + break; + default: + abort(); + } + + while (p < sz) + target[p++] = minval + rand() % (maxval - minval + 1); +} + +#define TEST(name) printf("test — %s\n", name); + +#define QL_TEST_VERBOSE 0 +static void ql_info(quicklist *ql) { +#if QL_TEST_VERBOSE + TEST_PRINT_INFO("Container length: %lu\n", ql->len); + TEST_PRINT_INFO("Container size: %lu\n", ql->count); + if (ql->head) + TEST_PRINT_INFO("\t(zsize head: %lu)\n", lpLength(ql->head->entry)); + if (ql->tail) + TEST_PRINT_INFO("\t(zsize tail: %lu)\n", lpLength(ql->tail->entry)); +#else + UNUSED(ql); +#endif +} + +/* Iterate over an entire quicklist. + * Print the list if 'print' == 1. + * + * Returns physical count of elements found by iterating over the list. */ +static int _itrprintr(quicklist *ql, int print, int forward) { + quicklistIter *iter = + quicklistGetIterator(ql, forward ? AL_START_HEAD : AL_START_TAIL); + quicklistEntry entry; + int i = 0; + int p = 0; + quicklistNode *prev = NULL; + while (quicklistNext(iter, &entry)) { + if (entry.node != prev) { + /* Count the number of list nodes too */ + p++; + prev = entry.node; + } + if (print) { + int size = (entry.sz > (1 << 20)) ? 
1 << 20 : entry.sz; + TEST_PRINT_INFO("[%3d (%2d)]: [%.*s] (%lld)\n", i, p, size, + (char *)entry.value, entry.longval); + } + i++; + } + quicklistReleaseIterator(iter); + return i; +} + +static int itrprintr(quicklist *ql, int print) { + return _itrprintr(ql, print, 1); +} + +static int itrprintr_rev(quicklist *ql, int print) { + return _itrprintr(ql, print, 0); +} + +#define ql_verify(a, b, c, d, e) \ + do { \ + err += _ql_verify((a), (b), (c), (d), (e)); \ + } while (0) + +static int _ql_verify_compress(quicklist *ql) { + int errors = 0; + if (quicklistAllowsCompression(ql)) { + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + for (unsigned int at = 0; at < ql->len; at++, node = node->next) { + if (node && (at < low_raw || at >= high_raw)) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + TEST_PRINT_INFO("Incorrect compression: node %d is " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; recompress: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress); + errors++; + } + } else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && + !node->attempted_compress) { + TEST_PRINT_INFO("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; recompress: %d; attempted: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress, node->attempted_compress); + errors++; + } + } + } + } + return errors; +} + +/* Verify list metadata matches physical list contents. 
*/ +static int _ql_verify(quicklist *ql, uint32_t len, uint32_t count, uint32_t head_count, uint32_t tail_count) { + int errors = 0; + + ql_info(ql); + if (len != ql->len) { + TEST_PRINT_INFO("quicklist length wrong: expected %d, got %lu", len, ql->len); + errors++; + } + + if (count != ql->count) { + TEST_PRINT_INFO("quicklist count wrong: expected %d, got %lu", count, ql->count); + errors++; + } + + int loopr = itrprintr(ql, 0); + if (loopr != (int)ql->count) { + TEST_PRINT_INFO("quicklist cached count not match actual count: expected %lu, got " + "%d", + ql->count, loopr); + errors++; + } + + int rloopr = itrprintr_rev(ql, 0); + if (loopr != rloopr) { + TEST_PRINT_INFO("quicklist has different forward count than reverse count! " + "Forward count is %d, reverse count is %d.", + loopr, rloopr); + errors++; + } + + if (ql->len == 0 && !errors) { + return errors; + } + + if (ql->head && head_count != ql->head->count && + head_count != lpLength(ql->head->entry)) { + TEST_PRINT_INFO("quicklist head count wrong: expected %d, " + "got cached %d vs. actual %lu", + head_count, ql->head->count, lpLength(ql->head->entry)); + errors++; + } + + if (ql->tail && tail_count != ql->tail->count && + tail_count != lpLength(ql->tail->entry)) { + TEST_PRINT_INFO("quicklist tail count wrong: expected %d, " + "got cached %u vs. actual %lu", + tail_count, ql->tail->count, lpLength(ql->tail->entry)); + errors++; + } + + errors += _ql_verify_compress(ql); + return errors; +} + +/* Release iterator and verify compress correctly. 
*/ +static void ql_release_iterator(quicklistIter *iter) { + quicklist *ql = NULL; + if (iter) ql = iter->quicklist; + quicklistReleaseIterator(iter); + if (ql && _ql_verify_compress(ql)) { + abort(); + } +} + +/*----------------------------------------------------------------------------- + * Quicklist Unit Test + *----------------------------------------------------------------------------*/ +int test_quicklistCreateList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("create list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail of empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head of empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail 5x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 5; i++) quicklistPushTail(ql, genstr("hello", i), 32); + if (ql->count != 5) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head 5x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 5; i++) quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 5) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail 500x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 64); + if (ql->count != 500) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 16, 500, 32, 20); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + 
TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHead500xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head 500x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 500) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 16, 500, 20, 32); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotateEmpty(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate empty"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistRotate(ql); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("Comprassion Plain node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + + char buf[large_limit]; + quicklist *ql = quicklistNew(fills[f], 1); + for (int i = 0; i < 500; i++) { + /* Set to 256 to allow the node to be triggered to compress, + * if it is less than 48(nocompress), the test will be successful. 
*/ + snprintf(buf, sizeof(buf), "hello%d", i); + quicklistPushHead(ql, buf, large_limit); + } + + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + TEST_ASSERT(QL_NODE_IS_PLAIN(entry.node)); + snprintf(buf, sizeof(buf), "hello%d", i); + if (strcmp((char *)entry.value, buf)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, buf, i); + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNextPlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("NEXT plain node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + quicklist *ql = quicklistNew(fills[f], options[_i]); + + char buf[large_limit]; + memcpy(buf, "plain", 5); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, "packed3", 7); + quicklistPushHead(ql, "packed4", 7); + quicklistPushHead(ql, buf, large_limit); + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + + while (quicklistNext(iter, &entry) != 0) { + if (QL_NODE_IS_PLAIN(entry.node)) + TEST_ASSERT(!memcmp(entry.value, "plain", 5)); + else + TEST_ASSERT(!memcmp(entry.value, "packed", 6)); + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotatePlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate plain node"); 
+ for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + + unsigned char *data = NULL; + size_t sz; + long long lv; + int i = 0; + quicklist *ql = quicklistNew(fills[f], options[_i]); + char buf[large_limit]; + memcpy(buf, "hello1", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello4", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello3", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello2", 6); + quicklistPushHead(ql, buf, large_limit); + quicklistRotate(ql); + + for (i = 1; i < 5; i++) { + TEST_ASSERT(QL_NODE_IS_PLAIN(ql->tail)); + quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + int temp_char = data[5]; + zfree(data); + TEST_ASSERT(temp_char == ('0' + i)); + } + + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate one val once"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistRotate(ql); + /* Ignore compression verify because listpack is + * too small to compress. 
*/ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate 500 val 5000 times at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushHead(ql, "900", 3); + quicklistPushHead(ql, "7000", 4); + quicklistPushHead(ql, "-1200", 5); + quicklistPushHead(ql, "42", 2); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 64); + ql_info(ql); + for (int i = 0; i < 5000; i++) { + ql_info(ql); + quicklistRotate(ql); + } + if (fills[f] == 1) + ql_verify(ql, 504, 504, 1, 1); + else if (fills[f] == 2) + ql_verify(ql, 252, 504, 2, 2); + else if (fills[f] == 32) + ql_verify(ql, 16, 504, 32, 24); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopEmpty(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop empty"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPop(ql, QUICKLIST_HEAD, NULL, NULL, NULL); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop 1 string from 1"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, 
options[_i]); + char *populate = genstr("hello", 331); + quicklistPushHead(ql, populate, 32); + unsigned char *data; + size_t sz; + long long lv; + ql_info(ql); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if (strcmp(populate, (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value (%s)", size, data, populate); + err++; + } + zfree(data); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 1 number from 1"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "55513", 5); + unsigned char *data; + size_t sz; + long long lv; + ql_info(ql); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); + TEST_ASSERT(data == NULL); + TEST_ASSERT(lv == 55513); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead500From500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 500 from 500"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_info(ql); + for (int i = 0; i < 500; i++) { + unsigned char *data; + size_t sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + TEST_ASSERT(ret == 1); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if 
(strcmp(genstr("hello", 499 - i), (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value (%s)", size, data, genstr("hello", 499 - i)); + err++; + } + zfree(data); + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead5000From500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 5000 from 500"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 5000; i++) { + unsigned char *data; + size_t sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + if (i < 500) { + TEST_ASSERT(ret == 1); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if (strcmp(genstr("hello", 499 - i), (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value " + "(%s)", + size, data, genstr("hello", 499 - i)); + err++; + } + zfree(data); + } else { + TEST_ASSERT(ret == 0); + } + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate forward over 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 499, count = 0; + 
while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, h, i); + err++; + } + i--; + count++; + } + if (count != 500) { + TEST_PRINT_INFO("Didn't iterate over exactly 500 elements (%d)", i); + err++; + } + ql_verify(ql, 16, 500, 20, 32); + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIterateReverseOver500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate reverse over 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, h, i); + err++; + } + i++; + } + if (i != 500) { + TEST_PRINT_INFO("Didn't iterate over exactly 500 elements (%d)", i); + err++; + } + ql_verify(ql, 16, 500, 20, 32); + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert after 1 element"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + iter = 
quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + quicklistInsertAfter(iter, &entry, "abc", 4); + ql_release_iterator(iter); + ql_verify(ql, 1, 2, 2, 2); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "hello", 5)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + sz = entry.sz; + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert before 1 element"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + quicklistInsertBefore(iter, &entry, "abc", 4); + ql_release_iterator(iter); + ql_verify(ql, 1, 2, 2, 2); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + sz = entry.sz; + if (strncmp((char *)entry.value, "hello", 5)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + 
return 0; +} + +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert head while head node is full"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(4, options[_i]); + for (int i = 0; i < 10; i++) quicklistPushTail(ql, genstr("hello", i), 6); + quicklistSetFill(ql, -1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -10, &entry); + char buf[4096] = {0}; + quicklistInsertBefore(iter, &entry, buf, 4096); + ql_release_iterator(iter); + ql_verify(ql, 4, 11, 1, 2); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert tail while tail node is full"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(4, options[_i]); + for (int i = 0; i < 10; i++) quicklistPushHead(ql, genstr("hello", i), 6); + quicklistSetFill(ql, -1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + char buf[4096] = {0}; + quicklistInsertAfter(iter, &entry, buf, 4096); + ql_release_iterator(iter); + ql_verify(ql, 4, 11, 2, 1); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert once in elements while iterating at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], 
options[_i]); + quicklistPushTail(ql, "abc", 3); + quicklistSetFill(ql, 1); + quicklistPushTail(ql, "def", 3); /* force to unique node */ + quicklistSetFill(ql, f); + quicklistPushTail(ql, "bob", 3); /* force to reset for +3 */ + quicklistPushTail(ql, "foo", 3); + quicklistPushTail(ql, "zoo", 3); + + itrprintr(ql, 0); + /* insert "bar" before "bob" while iterating over list. */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + while (quicklistNext(iter, &entry)) { + if (!strncmp((char *)entry.value, "bob", 3)) { + /* Insert as fill = 1 so it spills into new node. */ + quicklistInsertBefore(iter, &entry, "bar", 3); + break; /* didn't we fix insert-while-iterating? */ + } + } + ql_release_iterator(iter); + itrprintr(ql, 0); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (strncmp((char *)entry.value, "def", 3)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); + if (strncmp((char *)entry.value, "bar", 3)) { + TEST_PRINT_INFO("Value 2 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); + if (strncmp((char *)entry.value, "bob", 3)) { + TEST_PRINT_INFO("Value 3 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); + if (strncmp((char *)entry.value, "foo", 3)) { + TEST_PRINT_INFO("Value 4 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 
5, &entry); + if (strncmp((char *)entry.value, "zoo", 3)) { + TEST_PRINT_INFO("Value 5 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert [before] 250 new in middle of 500 elements at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); + quicklistInsertBefore(iter, &entry, genstr("abc", i), 32); + ql_release_iterator(iter); + } + if (fills[f] == 32) ql_verify(ql, 25, 750, 32, 20); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert [after] 250 new in middle of 500 elements at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); + quicklistInsertAfter(iter, &entry, genstr("abc", i), 32); + ql_release_iterator(iter); + } + + if (ql->count != 750) { + 
TEST_PRINT_INFO("List size not 750, but rather %ld", ql->count); + err++; + } + + if (fills[f] == 32) ql_verify(ql, 26, 750, 20, 32); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate empty list"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 0, 0, 0, 0); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate list of 1 element"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, genstr("hello", 3), 32); + ql_verify(ql, 1, 1, 1, 1); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 1, 1, 1, 1); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate list of 500"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 16, 500, 20, 32); + + quicklist *copy = quicklistDup(ql); + 
ql_verify(copy, 16, 500, 20, 32); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index 1,200 from 500 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (strcmp((char *)entry.value, "hello2") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 200, &entry); + if (strcmp((char *)entry.value, "hello201") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex12From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index -1,-2 from 500 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (strcmp((char *)entry.value, "hello500") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); + 
if (strcmp((char *)entry.value, "hello499") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index -100 from 500 list at fill at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -100, &entry); + if (strcmp((char *)entry.value, "hello401") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index too big +1 from 50 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + int sz = entry.sz; + iter = quicklistGetIteratorEntryAtIdx(ql, 50, &entry); + if (iter) { + TEST_PRINT_INFO("Index found at 50 with 50 list: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistDelRange(ql, 5, 20); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range of entire node in list of one node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 32); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range of entire node with overflow counts"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 128); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + 
quicklistIter *iter; + TEST("delete middle 100 of 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, 200, 100); + ql_verify(ql, 14, 400, 32, 20); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete less than fill but across nodes"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, 60, 10); + ql_verify(ql, 16, 490, 32, 20); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 1 from 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 1); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, 
char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 1 from 500 list with overflow counts"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 128); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 100 from 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistDelRange(ql, -100, 100); + ql_verify(ql, 13, 400, 32, 16); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete -10 count 5 from 50 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 2, 50, 32, 18); + quicklistDelRange(ql, -10, 5); + ql_verify(ql, 2, 45, 32, 13); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistNumbersOnlyListRead(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers only list read"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "1111", 4); + quicklistPushTail(ql, "2222", 4); + quicklistPushTail(ql, "3333", 4); + quicklistPushTail(ql, "4444", 4); + ql_verify(ql, 1, 4, 4, 4); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != 1111) { + TEST_PRINT_INFO("Not 1111, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (entry.longval != 2222) { + TEST_PRINT_INFO("Not 2222, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); + if (entry.longval != 3333) { + TEST_PRINT_INFO("Not 3333, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); + if (entry.longval != 4444) { + TEST_PRINT_INFO("Not 4444, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); + if (iter) { + TEST_PRINT_INFO("Index past elements: %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (entry.longval != 4444) { + TEST_PRINT_INFO("Not 4444 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); + if (entry.longval != 3333) { + TEST_PRINT_INFO("Not 3333 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -3, &entry); + if (entry.longval != 2222) { + TEST_PRINT_INFO("Not 2222 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = 
quicklistGetIteratorEntryAtIdx(ql, -4, &entry); + if (entry.longval != 1111) { + TEST_PRINT_INFO("Not 1111 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -5, &entry); + if (iter) { + TEST_PRINT_INFO("Index past elements (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers larger list read"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 5000; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistEntry entry; + for (int i = 0; i < 5000; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[i]) { + TEST_PRINT_INFO("[%d] Not longval %lld but rather %lld", i, nums[i], entry.longval); + err++; + } + entry.longval = 0xdeadbeef; + ql_release_iterator(iter); + } + iter = quicklistGetIteratorEntryAtIdx(ql, 5000, &entry); + if (strncmp((char *)entry.value, "xxxxxxxxxxxxxxxxxxxx", 20)) { + TEST_PRINT_INFO("String val not match: %s", entry.value); + err++; + } + ql_verify(ql, 157, 5001, 32, 9); + ql_release_iterator(iter); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers 
larger list read B"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "99", 2); + quicklistPushTail(ql, "98", 2); + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistPushTail(ql, "96", 2); + quicklistPushTail(ql, "95", 2); + quicklistReplaceAtIndex(ql, 1, "foo", 3); + quicklistReplaceAtIndex(ql, -1, "bar", 3); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("lrem test at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char *words[] = {"abc", "foo", "bar", "foobar", "foobared", "zap", "bar", "test", "foo"}; + char *result[] = {"abc", "foo", "foobar", "foobared", "zap", "test", "foo"}; + char *resultB[] = {"abc", "foo", "foobar", "foobared", "zap", "test"}; + for (int i = 0; i < 9; i++) quicklistPushTail(ql, words[i], strlen(words[i])); + + /* lrem 0 bar */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"bar", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + ql_release_iterator(iter); + + /* check result of lrem 0 bar */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + int sz = entry.sz; + if (strncmp((char *)entry.value, result[i], entry.sz)) { + TEST_PRINT_INFO("No match at position %d, got %.*s instead of %s", i, sz, entry.value, result[i]); + err++; + } + i++; + } + ql_release_iterator(iter); + + 
quicklistPushTail(ql, "foo", 3); + + /* lrem -2 foo */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + int del = 2; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"foo", 3)) { + quicklistDelEntry(iter, &entry); + del--; + } + if (!del) break; + i++; + } + ql_release_iterator(iter); + + /* check result of lrem -2 foo */ + /* (we're ignoring the '2' part and still deleting all foo + * because + * we only have two foo) */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + size_t resB = sizeof(resultB) / sizeof(*resultB); + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + int sz = entry.sz; + if (strncmp((char *)entry.value, resultB[resB - 1 - i], sz)) { + TEST_PRINT_INFO("No match at position %d, got %.*s instead of %s", i, sz, entry.value, + resultB[resB - 1 - i]); + err++; + } + i++; + } + + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate reverse + delete at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushTail(ql, "abc", 3); + quicklistPushTail(ql, "def", 3); + quicklistPushTail(ql, "hij", 3); + quicklistPushTail(ql, "jkl", 3); + quicklistPushTail(ql, "oop", 3); + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"hij", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + ql_release_iterator(iter); + + if (i != 5) { + TEST_PRINT_INFO("Didn't iterate 5 
times, iterated %d times.", i); + err++; + } + + /* Check results after deletion of "hij" */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + char *vals[] = {"abc", "def", "jkl", "oop"}; + while (quicklistNext(iter, &entry)) { + if (!quicklistCompare(&entry, (unsigned char *)vals[i], 3)) { + TEST_PRINT_INFO("Value at %d didn't match %s\n", i, vals[i]); + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterator at index test at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 760; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIteratorAtIdx(ql, AL_START_HEAD, 437); + int i = 437; + while (quicklistNext(iter, &entry)) { + if (entry.longval != nums[i]) { + TEST_PRINT_INFO("Expected %lld, but got %lld", entry.longval, nums[i]); + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test A at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long 
long nums[5000]; + for (int i = 0; i < 32; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 1, 32, 32, 32); + /* ltrim 25 53 (keep [25,32] inclusive = 7 remaining) */ + quicklistDelRange(ql, 0, 25); + quicklistDelRange(ql, 0, 0); + quicklistEntry entry; + for (int i = 0; i < 7; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[25 + i]) { + TEST_PRINT_INFO("Deleted invalid range! Expected %lld but got " + "%lld", + entry.longval, nums[25 + i]); + err++; + } + ql_release_iterator(iter); + } + if (fills[f] == 32) ql_verify(ql, 1, 7, 7, 7); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test B at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + /* Force-disable compression because our 33 sequential + * integers don't compress and the check always fails. 
*/ + quicklist *ql = quicklistNew(fills[f], QUICKLIST_NOCOMPRESS); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); + /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */ + quicklistDelRange(ql, 0, 5); + quicklistDelRange(ql, -16, 16); + if (fills[f] == 32) ql_verify(ql, 1, 12, 12, 12); + quicklistEntry entry; + + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != 5) { + TEST_PRINT_INFO("A: longval not 5, but %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (entry.longval != 16) { + TEST_PRINT_INFO("B! got instead: %lld", entry.longval); + err++; + } + quicklistPushTail(ql, "bobobob", 7); + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "bobobob", 7)) { + TEST_PRINT_INFO("Tail doesn't match bobobob, it's %.*s instead", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + for (int i = 0; i < 12; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[5 + i]) { + TEST_PRINT_INFO("Deleted invalid range! 
Expected %lld but got " + "%lld", + entry.longval, nums[5 + i]); + err++; + } + + ql_release_iterator(iter); + } + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test C at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); + /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */ + quicklistDelRange(ql, 0, 3); + quicklistDelRange(ql, -29, 4000); /* make sure not loop forever */ + if (fills[f] == 32) ql_verify(ql, 1, 1, 1, 1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != -5157318210846258173) { + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test D at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 
1); + quicklistDelRange(ql, -12, 3); + if (ql->count != 30) { + TEST_PRINT_INFO("Didn't delete exactly three elements! Count is: %lu", ql->count); + err++; + } + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + int accurate = flags & UNIT_TEST_ACCURATE; + + TEST("verify specific compression of interior nodes"); + + /* Run a longer test of compression depth outside of primary test loop. */ + int list_sizes[] = {250, 251, 500, 999, 1000}; + int list_count = accurate ? (int)(sizeof(list_sizes) / sizeof(*list_sizes)) : 1; + for (int list = 0; list < list_count; list++) { + for (int f = 0; f < fill_count; f++) { + for (int depth = 1; depth < 40; depth++) { + /* skip over many redundant test cases */ + quicklist *ql = quicklistNew(fills[f], depth); + for (int i = 0; i < list_sizes[list]; i++) { + quicklistPushTail(ql, genstr("hello TAIL", i + 1), 64); + quicklistPushHead(ql, genstr("hello HEAD", i + 1), 64); + } + + for (int step = 0; step < 2; step++) { + /* test remove node */ + if (step == 1) { + for (int i = 0; i < list_sizes[list] / 2; i++) { + unsigned char *data; + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, + NULL, NULL)); + zfree(data); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_TAIL, &data, + NULL, NULL)); + zfree(data); + } + } + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + for (unsigned int at = 0; at < ql->len; + at++, node = node->next) { + if (at < low_raw || at >= high_raw) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + TEST_PRINT_INFO("Incorrect compression: node %d is " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu)", + at, depth, low_raw, high_raw, ql->len, + node->sz); + err++; + } + } 
else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF) { + TEST_PRINT_INFO("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; attempted: %d)", + at, depth, low_raw, high_raw, ql->len, + node->sz, node->attempted_compress); + err++; + } + } + } + } + + quicklistRelease(ql); + } + } + } + TEST_ASSERT(err == 0); + return 0; +} + +/*----------------------------------------------------------------------------- + * Quicklist Bookmark Unit Test + *----------------------------------------------------------------------------*/ + +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("bookmark get updated to next item"); + + quicklist *ql = quicklistNew(1, 0); + quicklistPushTail(ql, "1", 1); + quicklistPushTail(ql, "2", 1); + quicklistPushTail(ql, "3", 1); + quicklistPushTail(ql, "4", 1); + quicklistPushTail(ql, "5", 1); + TEST_ASSERT(ql->len == 5); + /* add two bookmarks, one pointing to the node before the last. */ + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next)); + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev)); + /* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == ql->tail->prev); + TEST_ASSERT(quicklistDelRange(ql, -2, 1)); + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == ql->tail); + /* delete the last node, and see that the bookmark was deleted. 
*/ + TEST_ASSERT(quicklistDelRange(ql, -1, 1)); + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == NULL); + /* test that other bookmarks aren't affected */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_dummy") == ql->head->next); + TEST_ASSERT(quicklistBookmarkFind(ql, "_missing") == NULL); + TEST_ASSERT(ql->len == 3); + quicklistBookmarksClear(ql); /* for coverage */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_dummy") == NULL); + quicklistRelease(ql); + return 0; +} + +int test_quicklistBookmarkLimit(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("bookmark limit"); + + int i; + quicklist *ql = quicklistNew(1, 0); + quicklistPushHead(ql, "1", 1); + for (i = 0; i < QL_MAX_BM; i++) + TEST_ASSERT(quicklistBookmarkCreate(&ql, genstr("", i), ql->head)); + /* when all bookmarks are used, creation fails */ + TEST_ASSERT(!quicklistBookmarkCreate(&ql, "_test", ql->head)); + /* delete one and see that we can now create another */ + TEST_ASSERT(quicklistBookmarkDelete(ql, "0")); + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_test", ql->head)); + /* delete one and see that the rest survive */ + TEST_ASSERT(quicklistBookmarkDelete(ql, "_test")); + for (i = 1; i < QL_MAX_BM; i++) + TEST_ASSERT(quicklistBookmarkFind(ql, genstr("", i)) == ql->head); + /* make sure the deleted ones are indeed gone */ + TEST_ASSERT(!quicklistBookmarkFind(ql, "0")); + TEST_ASSERT(!quicklistBookmarkFind(ql, "_test")); + quicklistRelease(ql); + return 0; +} + +int test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST("compress and decompress quicklist listpack node"); + + if (!(flags & UNIT_TEST_LARGE_MEMORY)) return 0; + + quicklistNode *node = quicklistCreateNode(); + node->entry = lpNew(0); + + /* Just to avoid triggering the assertion in __quicklistCompressNode(), + * it disables the passing of quicklist head or tail node. 
*/ + node->prev = quicklistCreateNode(); + node->next = quicklistCreateNode(); + + /* Create a rand string */ + size_t sz = (1 << 25); /* 32MB per one entry */ + unsigned char *s = zmalloc(sz); + randstring(s, sz); + + /* Keep filling the node, until it reaches 1GB */ + for (int i = 0; i < 32; i++) { + node->entry = lpAppend(node->entry, s, sz); + node->sz = lpBytes((node)->entry); + + long long start = mstime(); + TEST_ASSERT(__quicklistCompressNode(node)); + TEST_ASSERT(__quicklistDecompressNode(node)); + TEST_PRINT_INFO("Compress and decompress: %zu MB in %.2f seconds.\n", + node->sz / 1024 / 1024, (float)(mstime() - start) / 1000); + } + + zfree(s); + zfree(node->prev); + zfree(node->next); + zfree(node->entry); + zfree(node); + return 0; +} + +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST("compress and decomress quicklist plain node large than UINT32_MAX"); + + if (!(flags & UNIT_TEST_LARGE_MEMORY)) return 0; + +#if ULONG_MAX >= 0xffffffffffffffff + + size_t sz = (1ull << 32); + unsigned char *s = zmalloc(sz); + randstring(s, sz); + memcpy(s, "helloworld", 10); + memcpy(s + sz - 10, "1234567890", 10); + + quicklistNode *node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, s, sz); + + /* Just to avoid triggering the assertion in __quicklistCompressNode(), + * it disables the passing of quicklist head or tail node. 
*/ + node->prev = quicklistCreateNode(); + node->next = quicklistCreateNode(); + + long long start = mstime(); + TEST_ASSERT(__quicklistCompressNode(node)); + TEST_ASSERT(__quicklistDecompressNode(node)); + TEST_PRINT_INFO("Compress and decompress: %zu MB in %.2f seconds.\n", + node->sz / 1024 / 1024, (float)(mstime() - start) / 1000); + + TEST_ASSERT(memcmp(node->entry, "helloworld", 10) == 0); + TEST_ASSERT(memcmp(node->entry + sz - 10, "1234567890", 10) == 0); + zfree(node->prev); + zfree(node->next); + zfree(node->entry); + zfree(node); + +#endif + return 0; +} From 863d31280369a290c5b51f446a2c018ce3e98da0 Mon Sep 17 00:00:00 2001 From: Parth <661497+parthpatel@users.noreply.github.com> Date: Wed, 13 Nov 2024 21:50:55 -0800 Subject: [PATCH 28/92] Fix link-time optimization to work correctly for unit tests (i.e. -flto flag) (#1290) (#1296) * We compile various c files into object and package them into library (.a file) using ar to feed to unit tests. With new GCC versions, the objects inside such library don't participate in LTO process without additional flags. * Here is a direct quote from gcc documentation explaining this issue: "If you are not using a linker with plugin support and/or do not enable the linker plugin, then the objects inside libfoo.a are extracted and linked as usual, but they do not participate in the LTO optimization process. In order to make a static library suitable for both LTO optimization and usual linkage, compile its object files with -flto-ffat-lto-objects." * Read full documentation about -flto at https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html * Without this additional flag, I get following errors while executing "make test-unit". With this change, those errors go away. ``` ARCHIVE libvalkey.a ar: threads_mngr.o: plugin needed to handle lto object ... .. . 
/tmp/ccDYbMXL.ltrans0.ltrans.o: In function `dictClear': /local/workplace/elasticache/valkey/src/unit/../dict.c:776: undefined reference to `valkey_free' /local/workplace/elasticache/valkey/src/unit/../dict.c:770: undefined reference to `valkey_free' /tmp/ccDYbMXL.ltrans0.ltrans.o: In function `dictGetVal': ``` Fixes #1290 --------- Signed-off-by: Parth Patel <661497+parthpatel@users.noreply.github.com> --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 21affe61a3..a76356e9d5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -25,7 +25,7 @@ ifeq ($(OPTIMIZATION),-O3) ifeq (clang,$(CLANG)) OPTIMIZATION+=-flto else - OPTIMIZATION+=-flto=auto + OPTIMIZATION+=-flto=auto -ffat-lto-objects endif endif ifneq ($(OPTIMIZATION),-O0) From 32f7541fe34e5e0520a5917d09661756d330bd11 Mon Sep 17 00:00:00 2001 From: Qu Chen Date: Thu, 14 Nov 2024 00:45:47 -0800 Subject: [PATCH 29/92] Simplify dictType callbacks and move some macros from dict.h to dict.c (#1281) Remove the dict pointer argument to the `dictType` callbacks `keyDup`, `keyCompare`, `keyDestructor` and `valDestructor`. This argument was unused in all of the callback implementations. The macros `dictFreeKey()` and `dictFreeVal()` are made internal to dict and moved from dict.h to dict.c. They're also changed from macros to static inline functions. 
Signed-off-by: Qu Chen --- src/config.c | 9 ++++----- src/dict.c | 18 ++++++++++++++--- src/dict.h | 25 +++++++++++------------- src/eval.c | 3 +-- src/functions.c | 15 ++++++-------- src/latency.c | 3 +-- src/module.c | 3 +-- src/sentinel.c | 5 ++--- src/server.c | 43 ++++++++++++++--------------------------- src/server.h | 12 ++++++------ src/unit/test_dict.c | 8 ++------ src/unit/test_kvstore.c | 3 +-- src/valkey-benchmark.c | 6 ++---- src/valkey-cli.c | 19 +++++++----------- 14 files changed, 74 insertions(+), 98 deletions(-) diff --git a/src/config.c b/src/config.c index f718543c39..15fec15276 100644 --- a/src/config.c +++ b/src/config.c @@ -1013,15 +1013,14 @@ void configGetCommand(client *c) { #define CONFIG_REWRITE_SIGNATURE "# Generated by CONFIG REWRITE" -/* We use the following dictionary type to store where a configuration - * option is mentioned in the old configuration file, so it's - * like "maxmemory" -> list of line numbers (first line is zero). */ -void dictListDestructor(dict *d, void *val); - /* Sentinel config rewriting is implemented inside sentinel.c by * rewriteConfigSentinelOption(). */ void rewriteConfigSentinelOption(struct rewriteConfigState *state); +/* We use the following dictionary type to store where a configuration + * option is mentioned in the old configuration file, so it's + * like "maxmemory" -> list of line numbers (first line is zero). + */ dictType optionToLineDictType = { dictSdsCaseHash, /* hash function */ NULL, /* key dup */ diff --git a/src/dict.c b/src/dict.c index f164820584..48c0f815bb 100644 --- a/src/dict.c +++ b/src/dict.c @@ -576,7 +576,7 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) { if (!position) return NULL; /* Dup the key if necessary. */ - if (d->type->keyDup) key = d->type->keyDup(d, key); + if (d->type->keyDup) key = d->type->keyDup(key); return dictInsertAtPosition(d, key, position); } @@ -640,7 +640,7 @@ int dictReplace(dict *d, void *key, void *val) { * reverse. 
*/ void *oldval = dictGetVal(existing); dictSetVal(d, existing, val); - if (d->type->valDestructor) d->type->valDestructor(d, oldval); + if (d->type->valDestructor) d->type->valDestructor(oldval); return 0; } @@ -742,6 +742,18 @@ dictEntry *dictUnlink(dict *d, const void *key) { return dictGenericDelete(d, key, 1); } +inline static void dictFreeKey(dict *d, dictEntry *entry) { + if (d->type->keyDestructor) { + d->type->keyDestructor(dictGetKey(entry)); + } +} + +inline static void dictFreeVal(dict *d, dictEntry *entry) { + if (d->type->valDestructor) { + d->type->valDestructor(dictGetVal(entry)); + } +} + /* You need to call this function to really free the entry after a call * to dictUnlink(). It's safe to call this function with 'he' = NULL. */ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { @@ -919,7 +931,7 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table : (entryIsEmbedded(de) ? &decodeEntryEmbedded(de)->field : (panic("Entry type not supported"), NULL))) void dictSetKey(dict *d, dictEntry *de, void *key) { - void *k = d->type->keyDup ? d->type->keyDup(d, key) : key; + void *k = d->type->keyDup ? 
d->type->keyDup(key) : key; if (entryIsNormal(de)) { dictEntryNormal *_de = decodeEntryNormal(de); _de->key = k; diff --git a/src/dict.h b/src/dict.h index 1c9e059baa..88ebd7bf99 100644 --- a/src/dict.h +++ b/src/dict.h @@ -53,10 +53,10 @@ typedef struct dict dict; typedef struct dictType { /* Callbacks */ uint64_t (*hashFunction)(const void *key); - void *(*keyDup)(dict *d, const void *key); - int (*keyCompare)(dict *d, const void *key1, const void *key2); - void (*keyDestructor)(dict *d, void *key); - void (*valDestructor)(dict *d, void *obj); + void *(*keyDup)(const void *key); + int (*keyCompare)(const void *key1, const void *key2); + void (*keyDestructor)(void *key); + void (*valDestructor)(void *obj); int (*resizeAllowed)(size_t moreMem, double usedRatio); /* Invoked at the start of dict initialization/rehashing (old and new ht are already created) */ void (*rehashingStarted)(dict *d); @@ -144,16 +144,13 @@ typedef struct { #define DICT_HT_INITIAL_SIZE (1 << (DICT_HT_INITIAL_EXP)) /* ------------------------------- Macros ------------------------------------*/ -#define dictFreeVal(d, entry) \ - do { \ - if ((d)->type->valDestructor) (d)->type->valDestructor((d), dictGetVal(entry)); \ - } while (0) - -#define dictFreeKey(d, entry) \ - if ((d)->type->keyDestructor) (d)->type->keyDestructor((d), dictGetKey(entry)) - -#define dictCompareKeys(d, key1, key2) \ - (((d)->type->keyCompare) ? (d)->type->keyCompare((d), key1, key2) : (key1) == (key2)) +static inline int dictCompareKeys(dict *d, const void *key1, const void *key2) { + if (d->type->keyCompare) { + return d->type->keyCompare(key1, key2); + } else { + return (key1 == key2); + } +} #define dictMetadata(d) (&(d)->metadata) #define dictMetadataSize(d) ((d)->type->dictMetadataBytes ? 
(d)->type->dictMetadataBytes(d) : 0) diff --git a/src/eval.c b/src/eval.c index fd12e40ad2..e5d7d56aa2 100644 --- a/src/eval.c +++ b/src/eval.c @@ -57,8 +57,7 @@ void evalGenericCommandWithDebugging(client *c, int evalsha); sds ldbCatStackValue(sds s, lua_State *lua, int idx); listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha); -static void dictLuaScriptDestructor(dict *d, void *val) { - UNUSED(d); +static void dictLuaScriptDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. */ decrRefCount(((luaScript *)val)->body); zfree(val); diff --git a/src/functions.c b/src/functions.c index e950024bad..c9ec42b322 100644 --- a/src/functions.c +++ b/src/functions.c @@ -43,9 +43,9 @@ typedef enum { static size_t engine_cache_memory = 0; /* Forward declaration */ -static void engineFunctionDispose(dict *d, void *obj); -static void engineStatsDispose(dict *d, void *obj); -static void engineLibraryDispose(dict *d, void *obj); +static void engineFunctionDispose(void *obj); +static void engineStatsDispose(void *obj); +static void engineLibraryDispose(void *obj); static int functionsVerifyName(sds name); typedef struct functionsLibEngineStats { @@ -126,15 +126,13 @@ static size_t libraryMallocSize(functionLibInfo *li) { return zmalloc_size(li) + sdsAllocSize(li->name) + sdsAllocSize(li->code); } -static void engineStatsDispose(dict *d, void *obj) { - UNUSED(d); +static void engineStatsDispose(void *obj) { functionsLibEngineStats *stats = obj; zfree(stats); } /* Dispose function memory */ -static void engineFunctionDispose(dict *d, void *obj) { - UNUSED(d); +static void engineFunctionDispose(void *obj) { if (!obj) { return; } @@ -158,8 +156,7 @@ static void engineLibraryFree(functionLibInfo *li) { zfree(li); } -static void engineLibraryDispose(dict *d, void *obj) { - UNUSED(d); +static void engineLibraryDispose(void *obj) { engineLibraryFree(obj); } diff --git a/src/latency.c b/src/latency.c index eef1532d03..783f04b197 100644 --- 
a/src/latency.c +++ b/src/latency.c @@ -37,8 +37,7 @@ #include "hdr_histogram.h" /* Dictionary type for latency events. */ -int dictStringKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictStringKeyCompare(const void *key1, const void *key2) { return strcmp(key1, key2) == 0; } diff --git a/src/module.c b/src/module.c index 2884239200..1e98b36f30 100644 --- a/src/module.c +++ b/src/module.c @@ -11814,8 +11814,7 @@ uint64_t dictCStringKeyHash(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -int dictCStringKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictCStringKeyCompare(const void *key1, const void *key2) { return strcmp(key1, key2) == 0; } diff --git a/src/sentinel.c b/src/sentinel.c index 711c4aea3e..ccd3ccbdca 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -416,8 +416,7 @@ void sentinelSimFailureCrash(void); void releaseSentinelValkeyInstance(sentinelValkeyInstance *ri); -void dictInstancesValDestructor(dict *d, void *obj) { - UNUSED(d); +void dictInstancesValDestructor(void *obj) { releaseSentinelValkeyInstance(obj); } @@ -4259,7 +4258,7 @@ void sentinelSetCommand(client *c) { /* If the target name is the same as the source name there * is no need to add an entry mapping to itself. */ - if (!dictSdsKeyCaseCompare(ri->renamed_commands, oldname, newname)) { + if (!dictSdsKeyCaseCompare(oldname, newname)) { oldname = sdsdup(oldname); newname = sdsdup(newname); dictAdd(ri->renamed_commands, oldname, newname); diff --git a/src/server.c b/src/server.c index 3217351faf..8841219697 100644 --- a/src/server.c +++ b/src/server.c @@ -360,25 +360,20 @@ void exitFromChild(int retcode) { * keys and Objects as values (Objects can hold SDS strings, * lists, sets). 
*/ -void dictVanillaFree(dict *d, void *val) { - UNUSED(d); +void dictVanillaFree(void *val) { zfree(val); } -void dictListDestructor(dict *d, void *val) { - UNUSED(d); +void dictListDestructor(void *val) { listRelease((list *)val); } -void dictDictDestructor(dict *d, void *val) { - UNUSED(d); +void dictDictDestructor(void *val) { dictRelease((dict *)val); } -int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; @@ -391,30 +386,26 @@ size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint /* A case insensitive version used for the command lookup table and other * places where case insensitive non binary-safe comparison is needed. */ -int dictSdsKeyCaseCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictSdsKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } -void dictObjectDestructor(dict *d, void *val) { - UNUSED(d); +void dictObjectDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. 
*/ decrRefCount(val); } -void dictSdsDestructor(dict *d, void *val) { - UNUSED(d); +void dictSdsDestructor(void *val) { sdsfree(val); } -void *dictSdsDup(dict *d, const void *key) { - UNUSED(d); +void *dictSdsDup(const void *key) { return sdsdup((const sds)key); } -int dictObjKeyCompare(dict *d, const void *key1, const void *key2) { +int dictObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; - return dictSdsKeyCompare(d, o1->ptr, o2->ptr); + return dictSdsKeyCompare(o1->ptr, o2->ptr); } uint64_t dictObjHash(const void *key) { @@ -446,16 +437,13 @@ uint64_t dictClientHash(const void *key) { } /* Dict compare function for client */ -int dictClientKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictClientKeyCompare(const void *key1, const void *key2) { return ((client *)key1)->id == ((client *)key2)->id; } /* Dict compare function for null terminated string */ -int dictCStrKeyCompare(dict *d, const void *key1, const void *key2) { +int dictCStrKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = strlen((char *)key1); l2 = strlen((char *)key2); if (l1 != l2) return 0; @@ -463,12 +451,11 @@ int dictCStrKeyCompare(dict *d, const void *key1, const void *key2) { } /* Dict case insensitive compare function for null terminated string */ -int dictCStrKeyCaseCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictCStrKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } -int dictEncObjKeyCompare(dict *d, const void *key1, const void *key2) { +int dictEncObjKeyCompare(const void *key1, const void *key2) { robj *o1 = (robj *)key1, *o2 = (robj *)key2; int cmp; @@ -480,7 +467,7 @@ int dictEncObjKeyCompare(dict *d, const void *key1, const void *key2) { * objects as well. 
*/ if (o1->refcount != OBJ_STATIC_REFCOUNT) o1 = getDecodedObject(o1); if (o2->refcount != OBJ_STATIC_REFCOUNT) o2 = getDecodedObject(o2); - cmp = dictSdsKeyCompare(d, o1->ptr, o2->ptr); + cmp = dictSdsKeyCompare(o1->ptr, o2->ptr); if (o1->refcount != OBJ_STATIC_REFCOUNT) decrRefCount(o1); if (o2->refcount != OBJ_STATIC_REFCOUNT) decrRefCount(o2); return cmp; diff --git a/src/server.h b/src/server.h index 5cf56e9c86..c7a9806cac 100644 --- a/src/server.h +++ b/src/server.h @@ -2730,7 +2730,7 @@ int serverSetProcTitle(char *title); int validateProcTitleTemplate(const char *template); int serverCommunicateSystemd(const char *sd_notify_msg); void serverSetCpuAffinity(const char *cpulist); -void dictVanillaFree(dict *d, void *val); +void dictVanillaFree(void *val); /* ERROR STATS constants */ @@ -3717,11 +3717,11 @@ void startEvictionTimeProc(void); /* Keys hashing / comparison functions for dict.c hash tables. */ uint64_t dictSdsHash(const void *key); uint64_t dictSdsCaseHash(const void *key); -int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); -int dictSdsKeyCaseCompare(dict *d, const void *key1, const void *key2); -void dictSdsDestructor(dict *d, void *val); -void dictListDestructor(dict *d, void *val); -void *dictSdsDup(dict *d, const void *key); +int dictSdsKeyCompare(const void *key1, const void *key2); +int dictSdsKeyCaseCompare(const void *key1, const void *key2); +void dictSdsDestructor(void *val); +void dictListDestructor(void *val); +void *dictSdsDup(const void *key); /* Git SHA1 */ char *serverGitSHA1(void); diff --git a/src/unit/test_dict.c b/src/unit/test_dict.c index a5af4eef79..b03d252c74 100644 --- a/src/unit/test_dict.c +++ b/src/unit/test_dict.c @@ -5,19 +5,15 @@ uint64_t hashCallback(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -int compareCallback(dict *d, const void *key1, const void *key2) { +int compareCallback(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = 
strlen((char *)key1); l2 = strlen((char *)key2); if (l1 != l2) return 0; return memcmp(key1, key2, l1) == 0; } -void freeCallback(dict *d, void *val) { - UNUSED(d); - +void freeCallback(void *val) { zfree(val); } diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index b3eff7d132..062b9f32fc 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -5,8 +5,7 @@ uint64_t hashTestCallback(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -void freeTestCallback(dict *d, void *val) { - UNUSED(d); +void freeTestCallback(void *val) { zfree(val); } diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index b22ee8cbed..57cdd6fc16 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -199,7 +199,7 @@ static long long showThroughput(struct aeEventLoop *eventLoop, long long id, voi /* Dict callbacks */ static uint64_t dictSdsHash(const void *key); -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); +static int dictSdsKeyCompare(const void *key1, const void *key2); /* Implementation */ static long long ustime(void) { @@ -220,10 +220,8 @@ static uint64_t dictSdsHash(const void *key) { return dictGenHashFunction((unsigned char *)key, sdslen((char *)key)); } -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +static int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; diff --git a/src/valkey-cli.c b/src/valkey-cli.c index b4a7fcaf91..0ba03dc6ba 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -172,9 +172,9 @@ static struct termios orig_termios; /* To restore terminal at exit.*/ /* Dict Helpers */ static uint64_t dictSdsHash(const void *key); -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); -static void dictSdsDestructor(dict *d, void *val); -static void dictListDestructor(dict *d, void *val); +static int 
dictSdsKeyCompare(const void *key1, const void *key2); +static void dictSdsDestructor(void *val); +static void dictListDestructor(void *val); /* Cluster Manager Command Info */ typedef struct clusterManagerCommand { @@ -371,23 +371,19 @@ static uint64_t dictSdsHash(const void *key) { return dictGenHashFunction((unsigned char *)key, sdslen((char *)key)); } -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +static int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; return memcmp(key1, key2, l1) == 0; } -static void dictSdsDestructor(dict *d, void *val) { - UNUSED(d); +static void dictSdsDestructor(void *val) { sdsfree(val); } -void dictListDestructor(dict *d, void *val) { - UNUSED(d); +void dictListDestructor(void *val) { listRelease((list *)val); } @@ -8663,9 +8659,8 @@ static typeinfo *typeinfo_add(dict *types, char *name, typeinfo *type_template) return info; } -void type_free(dict *d, void *val) { +void type_free(void *val) { typeinfo *info = val; - UNUSED(d); if (info->biggest_key) sdsfree(info->biggest_key); sdsfree(info->name); zfree(info); From b9994030e952788c8f736bcd02387dddf2c8b1cb Mon Sep 17 00:00:00 2001 From: bentotten <59932872+bentotten@users.noreply.github.com> Date: Thu, 14 Nov 2024 20:48:48 -0800 Subject: [PATCH 30/92] Log clusterbus handshake timeout failures (#1247) This adds a log when a handshake fails for a timeout. 
This can help troubleshoot cluster asymmetry issues caused by failed MEETs --------- Signed-off-by: Ben Totten Signed-off-by: bentotten <59932872+bentotten@users.noreply.github.com> Co-authored-by: Ben Totten Co-authored-by: Madelyn Olson --- src/cluster_legacy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index cfde3fd797..f1d3b878c2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4909,6 +4909,8 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { + serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d after %lldms", node->ip, + node->cport, handshake_timeout); clusterDelNode(node); return 1; } From d3f3b9cc3a452b6d18e9e862dcae5a923952c8da Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 15 Nov 2024 14:27:28 +0800 Subject: [PATCH 31/92] Fix daily valgrind build with unit tests (#1309) This was introduced in #515. 
Signed-off-by: Binbin --- .github/workflows/daily.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 62eecb1fa8..8e9045fe4b 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -506,7 +506,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests valgrind SERVER_CFLAGS='-Werror' + run: make valgrind all-with-unit-tests SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -575,7 +575,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' + run: make valgrind all-with-unit-tests CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update From 4e2493e5c961b36e6832e8d6ea259939b0cf0fde Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 15 Nov 2024 16:34:32 +0800 Subject: [PATCH 32/92] Kill diskless fork child asap when the last replica drops (#1227) We originally checked the replica connection to decide whether to kill the diskless child only when rdbPipeReadHandler is triggered. Actually we can check it when the replica is disconnected, so that we don't have to wait for rdbPipeReadHandler to be triggered and can kill the diskless fork child as soon as possible. In this way, when the child or rdbPipeReadHandler is stuck for some reason, we can kill the child faster and release the fork resources. 
Signed-off-by: Binbin --- src/networking.c | 8 +++++++- src/replication.c | 12 ++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/networking.c b/src/networking.c index 1a008a852d..0db1fda8d7 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1555,12 +1555,17 @@ void unlinkClient(client *c) { * in which case it needs to be cleaned from that list */ if (c->flag.replica && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { int i; + int still_alive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { if (server.rdb_pipe_conns[i] == c->conn) { rdbPipeWriteHandlerConnRemoved(c->conn); server.rdb_pipe_conns[i] = NULL; - break; } + if (server.rdb_pipe_conns[i]) still_alive++; + } + if (still_alive == 0) { + serverLog(LL_NOTICE, "Diskless rdb transfer, last replica dropped, killing fork child."); + killRDBChild(); } } /* Only use shutdown when the fork is active and we are the parent. */ @@ -1781,6 +1786,7 @@ void freeClient(client *c) { if (server.saveparamslen == 0 && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && anyOtherReplicaWaitRdb(c) == 0) { + serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); killRDBChild(); } if (c->repl_state == REPLICA_STATE_SEND_BULK) { diff --git a/src/replication.c b/src/replication.c index 48e98ab8e7..ce2f5d7983 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1669,7 +1669,9 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, if (!conn) continue; stillUp++; } - serverLog(LL_NOTICE, "Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); + if (stillUp) { + serverLog(LL_NOTICE, "Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); + } /* Now that the replicas have finished reading, notify the child that it's safe to exit. 
 * When the server detects the child has exited, it can mark the replica as online, and * start streaming the replication buffers. */ @@ -1678,7 +1680,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, return; } - int stillAlive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { ssize_t nwritten; connection *conn = server.rdb_pipe_conns[i]; @@ -1708,15 +1709,10 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, server.rdb_pipe_numconns_writing++; connSetWriteHandler(conn, rdbPipeWriteHandler); } - stillAlive++; } - if (stillAlive == 0) { - serverLog(LL_WARNING, "Diskless rdb transfer, last replica dropped, killing fork child."); - killRDBChild(); - } /* Remove the pipe read handler if at least one write handler was set. */ - if (server.rdb_pipe_numconns_writing || stillAlive == 0) { + if (server.rdb_pipe_numconns_writing) { aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); break; } From 92181b67970efad6df82ea2319ccd4a266dfec5e Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 15 Nov 2024 16:47:15 +0800 Subject: [PATCH 33/92] Fix primary crash when processing dirty slots during shutdown wait / failover wait / client pause (#1131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have an assert in propagateNow. If the primary node receives a CLUSTER UPDATE such as dirty slots during SIGTERM waiting or during a manual failover pausing or during a client pause, the delKeysInSlot call will trigger this assert and cause primary crash. In this case, we added a new server_del_keys_in_slot state just like client_pause_in_transaction to track the state to avoid the assert in propagateNow, the dirty slots will be deleted in the end without affecting the data consistency. 
Signed-off-by: Binbin Co-authored-by: Viktor Söderqvist --- src/cluster_legacy.c | 5 ++ src/networking.c | 2 +- src/server.c | 24 +++++++- src/server.h | 3 +- tests/unit/cluster/slot-ownership.tcl | 85 +++++++++++++++++++++++++++ tests/unit/pause.tcl | 27 +++++++++ 6 files changed, 142 insertions(+), 4 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f1d3b878c2..69af65f1e8 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6084,6 +6084,9 @@ void removeChannelsInSlot(unsigned int slot) { unsigned int delKeysInSlot(unsigned int hashslot) { if (!countKeysInSlot(hashslot)) return 0; + /* We may lose a slot during the pause. We need to track this + * state so that we don't assert in propagateNow(). */ + server.server_del_keys_in_slot = 1; unsigned int j = 0; kvstoreDictIterator *kvs_di = NULL; @@ -6108,6 +6111,8 @@ unsigned int delKeysInSlot(unsigned int hashslot) { } kvstoreReleaseDictIterator(kvs_di); + server.server_del_keys_in_slot = 0; + serverAssert(server.execution_nesting == 0); return j; } diff --git a/src/networking.c b/src/networking.c index 0db1fda8d7..4791055b5a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4571,7 +4571,7 @@ static void pauseClientsByClient(mstime_t endTime, int isPauseClientAll) { } /* Pause actions up to the specified unixtime (in ms) for a given type of - * commands. + * purpose. * * A main use case of this function is to allow pausing replication traffic * so that a failover without data loss to occur. 
Replicas will continue to receive diff --git a/src/server.c b/src/server.c index 8841219697..12691df8ee 100644 --- a/src/server.c +++ b/src/server.c @@ -3315,8 +3315,28 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) { if (!shouldPropagate(target)) return; /* This needs to be unreachable since the dataset should be fixed during - * replica pause (otherwise data may be lost during a failover) */ - serverAssert(!(isPausedActions(PAUSE_ACTION_REPLICA) && (!server.client_pause_in_transaction))); + * replica pause (otherwise data may be lost during a failover). + * + * Though, there are exceptions: + * + * 1. We allow write commands that were queued up before and after to + * execute, if a CLIENT PAUSE executed during a transaction, we will + * track the state, the CLIENT PAUSE takes effect only after a transaction + * has finished. + * 2. Primary loses a slot during the pause, deletes all keys and replicates + * DEL to its replicas. In this case, we will track the state, the dirty + * slots will be deleted in the end without affecting the data consistency. + * + * Note that case 2 can happen in one of the following scenarios: + * 1) The primary waits for the replica to replicate before exiting, see + * shutdown-timeout in conf for more details. In this case, primary lost + * a slot during the SIGTERM waiting. + * 2) The primary waits for the replica to replicate during a manual failover. + * In this case, primary lost a slot during the pausing. + * 3) The primary was paused by CLIENT PAUSE, and lost a slot during the + * pausing. 
*/ + serverAssert(!isPausedActions(PAUSE_ACTION_REPLICA) || server.client_pause_in_transaction || + server.server_del_keys_in_slot); if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc); if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc); diff --git a/src/server.h b/src/server.h index c7a9806cac..5ef04a9080 100644 --- a/src/server.h +++ b/src/server.h @@ -1701,6 +1701,7 @@ struct valkeyServer { const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */ char *ignore_warnings; /* Config: warnings that should be ignored. */ int client_pause_in_transaction; /* Was a client pause executed during this Exec? */ + int server_del_keys_in_slot; /* The server is deleting the keys in the dirty slot. */ int thp_enabled; /* If true, THP is enabled. */ size_t page_size; /* The page size of OS. */ /* Modules */ @@ -2863,7 +2864,7 @@ void flushReplicasOutputBuffers(void); void disconnectReplicas(void); void evictClients(void); int listenToPort(connListener *fds); -void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions_bitmask); +void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions); void unpauseActions(pause_purpose purpose); uint32_t isPausedActions(uint32_t action_bitmask); uint32_t isPausedActionsWithUpdate(uint32_t action_bitmask); diff --git a/tests/unit/cluster/slot-ownership.tcl b/tests/unit/cluster/slot-ownership.tcl index 0f3e3cc4f7..0073c2904f 100644 --- a/tests/unit/cluster/slot-ownership.tcl +++ b/tests/unit/cluster/slot-ownership.tcl @@ -59,3 +59,88 @@ start_cluster 2 2 {tags {external:skip cluster}} { } } } + +start_cluster 3 1 {tags {external:skip cluster} overrides {shutdown-timeout 100}} { + test "Primary lost a slot during the shutdown waiting" { + R 0 set FOO 0 + + # Pause the replica. + pause_process [srv -3 pid] + + # Incr the key and immediately shutdown the primary. + # The primary waits for the replica to replicate before exiting. 
+ R 0 incr FOO + exec kill -SIGTERM [srv 0 pid] + wait_for_condition 50 100 { + [s 0 shutdown_in_milliseconds] > 0 + } else { + fail "Primary not indicating ongoing shutdown." + } + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Resume the replica and make sure primary exits normally instead of crashing. + resume_process [srv -3 pid] + wait_for_log_messages 0 {"*Valkey is now ready to exit, bye bye*"} 0 1000 10 + + # Make sure that the replica will become the new primary and does not own the key. + wait_for_condition 1000 50 { + [s -3 role] eq {master} + } else { + fail "The replica was not converted into primary" + } + assert_error {ERR no such key} {R 3 debug object foo} + } +} + +start_cluster 3 1 {tags {external:skip cluster}} { + test "Primary lost a slot during the manual failover pausing" { + R 0 set FOO 0 + + # Set primaries to drop the FAILOVER_AUTH_REQUEST packets, so that + # primary 0 will pause until the failover times out. + R 1 debug drop-cluster-packet-filter 5 + R 2 debug drop-cluster-packet-filter 5 + + # Replica doing the manual failover. + R 3 cluster failover + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Make sure primary doesn't crash when deleting the keys. 
+ R 0 ping + + R 1 debug drop-cluster-packet-filter -1 + R 2 debug drop-cluster-packet-filter -1 + } +} + +start_cluster 3 1 {tags {external:skip cluster}} { + test "Primary lost a slot during the client pause command" { + R 0 set FOO 0 + + R 0 client pause 1000000000 write + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Make sure primary doesn't crash when deleting the keys. + R 0 ping + + R 0 client unpause + } +} diff --git a/tests/unit/pause.tcl b/tests/unit/pause.tcl index 38c13afc46..b18a32d48f 100644 --- a/tests/unit/pause.tcl +++ b/tests/unit/pause.tcl @@ -260,6 +260,33 @@ start_server {tags {"pause network"}} { r client unpause } + test "Test eviction is skipped during client pause" { + r flushall + set evicted_keys [s 0 evicted_keys] + + r multi + r set foo{t} bar + r config set maxmemory-policy allkeys-random + r config set maxmemory 1 + r client PAUSE 50000 WRITE + r exec + + # No keys should actually have been evicted. + assert_match $evicted_keys [s 0 evicted_keys] + + # The previous config set triggers a time event, but due to the pause, + # no eviction has been made. After the unpause, a eviction will happen. + r client unpause + wait_for_condition 1000 10 { + [expr $evicted_keys + 1] eq [s 0 evicted_keys] + } else { + fail "Key is not evicted" + } + + r config set maxmemory 0 + r config set maxmemory-policy noeviction + } + test "Test both active and passive expires are skipped during client pause" { set expired_keys [s 0 expired_keys] r multi From 86f33ea2b05e0f14391942c635a87974eb103937 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 15 Nov 2024 16:48:13 +0800 Subject: [PATCH 34/92] Unprotect rdb channel when bgsave child fails in dual channel replication (#1297) If bgsaveerr is error, there is no need to protect the rdb channel. 
The impact of this may be that when bgsave fails, we will protect the rdb channel for 60s. It may occupy the reference of the repl buf block, making it impossible to recycle it until we free the client due to COB or free the client after 60s. We kept the RDB channel open as long as the replica hadn't established a main connection, even if the snapshot process failed. There is no value in keeping the RDB client in this case. Signed-off-by: Binbin --- src/replication.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/replication.c b/src/replication.c index ce2f5d7983..48f02cf658 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1741,6 +1741,8 @@ void updateReplicasWaitingBgsave(int bgsaveerr, int type) { struct valkey_stat buf; if (bgsaveerr != C_OK) { + /* If bgsaveerr is error, there is no need to protect the rdb channel. */ + replica->flag.protected_rdb_channel = 0; freeClientAsync(replica); serverLog(LL_WARNING, "SYNC failed. BGSAVE child returned an error"); continue; From aa2dd3ecb82bce5d76f7796c5e6df3e5c6e55203 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 16 Nov 2024 18:58:25 +0800 Subject: [PATCH 35/92] Stabilize replica migration test to make sure cluster config is consistent (#1311) CI report this failure: ``` [exception]: Executing test client: MOVED 1 127.0.0.1:22128. MOVED 1 127.0.0.1:22128 while executing "wait_for_condition 1000 50 { [R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 && [R 4 get key_991803] == 1024 && ..." ``` This may be because, even though the cluster state becomes OK, The cluster still has inconsistent configuration for a short period of time. We make sure to wait for the config to be consistent. 
Signed-off-by: Binbin --- tests/unit/cluster/replica-migration.tcl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index d04069ef16..591d732fce 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -90,6 +90,8 @@ proc test_migrated_replica {type} { # Wait for the cluster to be ok. wait_for_condition 1000 50 { + [R 3 cluster slots] eq [R 4 cluster slots] && + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 3 cluster_state] eq "ok" && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" @@ -187,6 +189,7 @@ proc test_nonempty_replica {type} { # Wait for the cluster to be ok. wait_for_condition 1000 50 { + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" } else { @@ -306,6 +309,8 @@ proc test_sub_replica {type} { # Wait for the cluster to be ok. wait_for_condition 1000 50 { + [R 3 cluster slots] eq [R 4 cluster slots] && + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 3 cluster_state] eq "ok" && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" From 94113fde7fb251e24911e51ab8cf2a696864ebb6 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Mon, 18 Nov 2024 07:52:35 +0200 Subject: [PATCH 36/92] Improvements for TLS with I/O threads (#1271) Main thread profiling revealed significant overhead in TLS operations, even with read/write offloaded to I/O threads: Perf results: **10.82%** 8.82% `valkey-server libssl.so.3 [.] SSL_pending` # Called by main thread after I/O completion **10.16%** 5.06% `valkey-server libcrypto.so.3 [.] ERR_clear_error` # Called for every event regardless of thread handling This commit further optimizes TLS operations by moving more work from the main thread to I/O threads: Improve TLS offloading to I/O threads with two main changes: 1. 
Move `ERR_clear_error()` calls closer to SSL operations - Currently, error queue is cleared for every TLS event - Now only clear before actual SSL function calls - This prevents unnecessary clearing in main thread when operations are handled by I/O threads 2. Optimize `SSL_pending()` checks - Add `TLS_CONN_FLAG_HAS_PENDING` flag to track pending data - Move pending check to follow read operations immediately - I/O thread sets flag when pending data exists - Main thread uses flag to update pending list Performance improvements: Testing setup based on https://valkey.io/blog/unlock-one-million-rps-part2/ Before: - SET: 896,047 ops/sec - GET: 875,794 ops/sec After: - SET: 985,784 ops/sec (+10% improvement) - GET: 1,066,171 ops/sec (+22% improvement) Signed-off-by: Uri Yagelnik --- src/tls.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/tls.c b/src/tls.c index f1c82d35e4..a1fda2a7ae 100644 --- a/src/tls.c +++ b/src/tls.c @@ -446,6 +446,7 @@ typedef enum { #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) +#define TLS_CONN_FLAG_HAS_PENDING (1 << 4) typedef struct tls_connection { connection c; @@ -614,7 +615,7 @@ static void updatePendingData(tls_connection *conn) { /* If SSL has pending data, already read from the socket, we're at risk of not calling the read handler again, make * sure to add it to a list of pending connection that should be handled anyway. 
*/ - if (SSL_pending(conn->ssl) > 0) { + if (conn->flags & TLS_CONN_FLAG_HAS_PENDING) { if (!conn->pending_list_node) { listAddNodeTail(pending_list, conn); conn->pending_list_node = listLast(pending_list); @@ -625,6 +626,14 @@ static void updatePendingData(tls_connection *conn) { } } +void updateSSLPendingFlag(tls_connection *conn) { + if (SSL_pending(conn->ssl) > 0) { + conn->flags |= TLS_CONN_FLAG_HAS_PENDING; + } else { + conn->flags &= ~TLS_CONN_FLAG_HAS_PENDING; + } +} + static void updateSSLEvent(tls_connection *conn) { if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) return; @@ -653,8 +662,6 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { TLSCONN_DEBUG("tlsEventHandler(): fd=%d, state=%d, mask=%d, r=%d, w=%d, flags=%d", fd, conn->c.state, mask, conn->c.read_handler != NULL, conn->c.write_handler != NULL, conn->flags); - ERR_clear_error(); - switch (conn->c.state) { case CONN_STATE_CONNECTING: conn_error = anetGetError(conn->c.fd); @@ -662,6 +669,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.last_errno = conn_error; conn->c.state = CONN_STATE_ERROR; } else { + ERR_clear_error(); if (!(conn->flags & TLS_CONN_FLAG_FD_SET)) { SSL_set_fd(conn->ssl, conn->c.fd); conn->flags |= TLS_CONN_FLAG_FD_SET; @@ -690,6 +698,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: + ERR_clear_error(); ret = SSL_accept(conn->ssl); if (ret <= 0) { WantIOType want = 0; @@ -747,10 +756,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } - - if (mask & AE_READABLE) { - updatePendingData(conn); - } + updatePendingData(conn); break; } @@ -941,6 +947,7 @@ static int connTLSRead(connection *conn_, void *buf, size_t buf_len) { if (conn->c.state != CONN_STATE_CONNECTED) return -1; ERR_clear_error(); ret = SSL_read(conn->ssl, buf, 
buf_len); + updateSSLPendingFlag(conn); return updateStateAfterSSLIO(conn, ret, 1); } @@ -992,7 +999,7 @@ static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, * which means the specified timeout will not be enforced accurately. */ SSL_set_fd(conn->ssl, conn->c.fd); setBlockingTimeout(conn, timeout); - + ERR_clear_error(); if ((ret = SSL_connect(conn->ssl)) <= 0) { conn->c.state = CONN_STATE_ERROR; return C_ERR; @@ -1023,6 +1030,7 @@ static ssize_t connTLSSyncRead(connection *conn_, char *ptr, ssize_t size, long setBlockingTimeout(conn, timeout); ERR_clear_error(); int ret = SSL_read(conn->ssl, ptr, size); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); unsetBlockingTimeout(conn); @@ -1041,6 +1049,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l ERR_clear_error(); int ret = SSL_read(conn->ssl, &c, 1); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); if (ret <= 0) { nread = -1; From d07674fc01fd9b3b4fdd8c13de74d3d28697ddc5 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 18 Nov 2024 14:55:26 +0800 Subject: [PATCH 37/92] Fix sds unittest tests to check for zmalloc_usable_size (#1314) s_malloc_size == zmalloc_size, currently sdsAllocSize does not calculate PREFIX_SIZE when no malloc_size available, this casue test_typesAndAllocSize fail in the new unittest, what we want to check is actually zmalloc_usable_size. 
Signed-off-by: Binbin --- src/unit/test_sds.c | 15 ++++++++------- src/unit/test_zmalloc.c | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c index b97d0d9d32..30f25e4f6f 100644 --- a/src/unit/test_sds.c +++ b/src/unit/test_sds.c @@ -259,43 +259,44 @@ int test_typesAndAllocSize(int argc, char **argv, int flags) { sds x = sdsnewlen(NULL, 31); TEST_ASSERT_MESSAGE("len 31 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_5); + TEST_ASSERT_MESSAGE("len 31 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 32); TEST_ASSERT_MESSAGE("len 32 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 252); TEST_ASSERT_MESSAGE("len 252 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 253); TEST_ASSERT_MESSAGE("len 253 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65530); TEST_ASSERT_MESSAGE("len 65530 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65531); TEST_ASSERT_MESSAGE("len 65531 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 
65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); #if (LONG_MAX == LLONG_MAX) if (flags & UNIT_TEST_LARGE_MEMORY) { x = sdsnewlen(NULL, 4294967286); TEST_ASSERT_MESSAGE("len 4294967286 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 4294967287); TEST_ASSERT_MESSAGE("len 4294967287 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_64); - TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); } #endif diff --git a/src/unit/test_zmalloc.c b/src/unit/test_zmalloc.c index 6c1d03e8e1..08444a157e 100644 --- a/src/unit/test_zmalloc.c +++ b/src/unit/test_zmalloc.c @@ -6,6 +6,8 @@ int test_zmallocInitialUsedMemory(int argc, char **argv, int flags) { UNUSED(argv); UNUSED(flags); + /* If this fails, it may be that other tests have failed and the memory has not been released. */ + TEST_PRINT_INFO("test_zmallocInitialUsedMemory; used: %zu\n", zmalloc_used_memory()); TEST_ASSERT(zmalloc_used_memory() == 0); return 0; From c5012cc630bb65c07a17ea870630edd8825cde52 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:09:35 +0200 Subject: [PATCH 38/92] Optimize RDB load performance and fix cluster mode resizing on replica side (#1199) This PR addresses two issues: 1. Performance Degradation Fix - Resolves a significant performance issue during RDB load on replica nodes. - The problem was causing replicas to rehash multiple times during the load process. 
Local testing demonstrated up to 50% degradation in BGSAVE time. - The problem occurs when the replica tries to expand pre-created slot dictionaries. This operation fails quietly, resulting in undetected performance issues. - This fix aims to optimize the RDB load process and restore expected performance levels. 2. Bug fix when reading `RDB_OPCODE_RESIZEDB` in Valkey 8.0 cluster mode- - Use the shard's master slots count when processing this opcode, as `clusterNodeCoversSlot` is not initialized for the currently syncing replica. - Previously, this problem went unnoticed because `RDB_OPCODE_RESIZEDB` had no practical impact (due to 1). These improvements will enhance overall system performance and ensure smoother upgrades to Valkey 8.0 in the future. Testing: - Conducted local tests to verify the performance improvement during RDB load. - Verified that ignoring `RDB_OPCODE_RESIZEDB` does not negatively impact functionality in the current version. Signed-off-by: naglera Co-authored-by: Binbin --- src/db.c | 2 +- src/kvstore.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/db.c b/src/db.c index 3e0e5a2e63..b59c7727b2 100644 --- a/src/db.c +++ b/src/db.c @@ -1884,7 +1884,7 @@ keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { * The purpose is to skip expansion of unused dicts in cluster mode (all * dicts not mapped to *my* slots) */ static int dbExpandSkipSlot(int slot) { - return !clusterNodeCoversSlot(getMyClusterNode(), slot); + return !clusterNodeCoversSlot(clusterNodeGetPrimary(getMyClusterNode()), slot); } /* diff --git a/src/kvstore.c b/src/kvstore.c index 7142fa0f61..49662f330a 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -423,9 +423,11 @@ unsigned long long kvstoreScan(kvstore *kvs, * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. 
*/ int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb) { + if (newsize == 0) return 1; for (int i = 0; i < kvs->num_dicts; i++) { - dict *d = kvstoreGetDict(kvs, i); - if (!d || (skip_cb && skip_cb(i))) continue; + if (skip_cb && skip_cb(i)) continue; + /* If the dictionary doesn't exist, create it */ + dict *d = createDictIfNeeded(kvs, i); int result = try_expand ? dictTryExpand(d, newsize) : dictExpand(d, newsize); if (try_expand && result == DICT_ERR) return 0; } From f9d0b876224beecc8386cce5e11d43e649b82189 Mon Sep 17 00:00:00 2001 From: Seungmin Lee <155032684+sungming2@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:00:30 -0800 Subject: [PATCH 39/92] Upgrade macos-12 to macos-13 in workflows (#1318) ### Problem GitHub Actions is starting the deprecation process for macOS 12. Deprecation will begin on 10/7/24 and the image will be fully unsupported by 12/3/24. For more details, see https://github.com/actions/runner-images/issues/10721 Signed-off-by: Seungmin Lee Co-authored-by: Seungmin Lee --- .github/workflows/daily.yml | 4 ++-- deps/hiredis/.github/workflows/build.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 8e9045fe4b..8bdbc8d4c2 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -990,7 +990,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, macos-14] + os: [macos-13, macos-14] runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' || @@ -1019,7 +1019,7 @@ jobs: run: make SERVER_CFLAGS='-Werror' test-freebsd: - runs-on: macos-12 + runs-on: macos-13 if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || diff --git a/deps/hiredis/.github/workflows/build.yml b/deps/hiredis/.github/workflows/build.yml index 581800b4f7..048ee51cd4 100644 --- 
a/deps/hiredis/.github/workflows/build.yml +++ b/deps/hiredis/.github/workflows/build.yml @@ -112,7 +112,7 @@ jobs: run: $GITHUB_WORKSPACE/test.sh freebsd: - runs-on: macos-12 + runs-on: macos-13 name: FreeBSD steps: - uses: actions/checkout@v3 From 3d0c8342030654bdfaf74d08d2d5645ff616c7a7 Mon Sep 17 00:00:00 2001 From: Seungmin Lee <155032684+sungming2@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:06:35 -0800 Subject: [PATCH 40/92] Fix LRU crash when getting too many random lua scripts (#1310) ### Problem Valkey stores scripts in a dictionary (lua_scripts) keyed by their SHA1 hashes, but it needs a way to know which scripts are least recently used. It uses an LRU list (lua_scripts_lru_list) to keep track of scripts in usage order. When the list reaches a maximum length, Valkey evicts the oldest scripts to free memory in both the list and dictionary. The problem here is that the sds from the LRU list can be pointing to already freed/moved memory by active defrag that the sds in the dictionary used to point to. It results in assertion error at [this line](https://github.com/valkey-io/valkey/blob/unstable/src/eval.c#L519) ### Solution If we duplicate the sds when adding it to the LRU list, we can create an independent copy of the script identifier (sha). This duplication ensures that the sha string in the LRU list remains stable and unaffected by any defragmentation that could alter or free the original sds. In addition, dictUnlink doesn't require exact pointer match([ref](https://github.com/valkey-io/valkey/blob/unstable/src/eval.c#L71-L78)) so this change makes sense to unlink the right dictEntry with the copy of the sds. ### Reproduce To reproduce it with tcl test: 1. Disable je_get_defrag_hint in defrag.c to trigger defrag often 2. 
Execute test script ``` start_server {tags {"auth external:skip"}} { test {Regression for script LRU crash} { r config set activedefrag yes r config set active-defrag-ignore-bytes 1 r config set active-defrag-threshold-lower 0 r config set active-defrag-threshold-upper 1 r config set active-defrag-cycle-min 99 r config set active-defrag-cycle-max 99 for {set i 0} {$i < 100000} {incr i} { r eval "return $i" 0 } after 5000; } } ``` ### Crash info Crash report: ``` === REDIS BUG REPORT START: Cut & paste starting from here === 14044:M 12 Nov 2024 14:51:27.054 # === ASSERTION FAILED === 14044:M 12 Nov 2024 14:51:27.054 # ==> eval.c:556 'de' is not true ------ STACK TRACE ------ Backtrace: /usr/bin/redis-server 127.0.0.1:6379 [cluster](luaDeleteFunction+0x148)[0x723708] /usr/bin/redis-server 127.0.0.1:6379 [cluster](luaCreateFunction+0x26c)[0x724450] /usr/bin/redis-server 127.0.0.1:6379 [cluster](evalCommand+0x2bc)[0x7254dc] /usr/bin/redis-server 127.0.0.1:6379 [cluster](call+0x574)[0x5b8d14] /usr/bin/redis-server 127.0.0.1:6379 [cluster](processCommand+0xc84)[0x5b9b10] /usr/bin/redis-server 127.0.0.1:6379 [cluster](processCommandAndResetClient+0x11c)[0x6db63c] /usr/bin/redis-server 127.0.0.1:6379 [cluster](processInputBuffer+0x1b0)[0x6dffd4] /usr/bin/redis-server 127.0.0.1:6379 [cluster][0x6bd968] /usr/bin/redis-server 127.0.0.1:6379 [cluster][0x659634] /usr/bin/redis-server 127.0.0.1:6379 [cluster](amzTLSEventHandler+0x194)[0x6588d8] /usr/bin/redis-server 127.0.0.1:6379 [cluster][0x750c88] /usr/bin/redis-server 127.0.0.1:6379 [cluster](aeProcessEvents+0x228)[0x757fa8] /usr/bin/redis-server 127.0.0.1:6379 [cluster](redisMain+0x478)[0x7786b8] /lib64/libc.so.6(__libc_start_main+0xe4)[0xffffa7763da4] /usr/bin/redis-server 127.0.0.1:6379 [cluster][0x5ad3b0] ``` Defrag info: ``` mem_fragmentation_ratio:1.18 mem_fragmentation_bytes:47229992 active_defrag_hits:20561 active_defrag_misses:5878518 active_defrag_key_hits:77 active_defrag_key_misses:212 
total_active_defrag_time:29009 ``` ### Test: Run the test script to push 100,000 scripts to ensure the LRU list keeps 500 maximum length without any crash. ``` 27489:M 14 Nov 2024 20:56:41.583 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.583 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 27489:M 14 Nov 2024 20:56:41.584 * LRU List length: 500 [ok]: Regression for script LRU crash (6811 ms) [1/1 done]: unit/test (7 seconds) ``` --------- Signed-off-by: Seungmin Lee Signed-off-by: Seungmin Lee <155032684+sungming2@users.noreply.github.com> Co-authored-by: Seungmin Lee Co-authored-by: Binbin --- src/eval.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/eval.c b/src/eval.c index e5d7d56aa2..a9c50cdf90 100644 --- a/src/eval.c +++ b/src/eval.c @@ -199,10 +199,12 @@ void scriptingInit(int setup) { } /* Initialize a dictionary we use to map SHAs to scripts. - * Initialize a list we use for lua script evictions, it shares the - * sha with the dictionary, so free fn is not set. */ + * Initialize a list we use for lua script evictions. + * Note that we duplicate the sha when adding to the lru list due to defrag, + * and we need to free them respectively. 
*/ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); + listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -518,9 +520,6 @@ void luaDeleteFunction(client *c, sds sha) { dictEntry *de = dictUnlink(lctx.lua_scripts, sha); serverAssertWithInfo(c ? c : lctx.lua_client, NULL, de); luaScript *l = dictGetVal(de); - /* We only delete `EVAL` scripts, which must exist in the LRU list. */ - serverAssert(l->node); - listDelNode(lctx.lua_scripts_lru_list, l->node); lctx.lua_scripts_mem -= sdsAllocSize(sha) + getStringObjectSdsUsedMemory(l->body); dictFreeUnlinkedEntry(lctx.lua_scripts, de); } @@ -549,11 +548,12 @@ listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha) { listNode *ln = listFirst(lctx.lua_scripts_lru_list); sds oldest = listNodeValue(ln); luaDeleteFunction(c, oldest); + listDelNode(lctx.lua_scripts_lru_list, ln); server.stat_evictedscripts++; } /* Add current. */ - listAddNodeTail(lctx.lua_scripts_lru_list, sha); + listAddNodeTail(lctx.lua_scripts_lru_list, sdsdup(sha)); return listLast(lctx.lua_scripts_lru_list); } From 132798b57d7f95ad5901495d566578bf8ba71390 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 19 Nov 2024 23:42:50 +0800 Subject: [PATCH 41/92] Receipt of REPLCONF VERSION reply should be triggered by event (#1320) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the missing return when repl_state changes to RECEIVE_VERSION_REPLY, this way we won’t be blocked if the primary doesn’t reply with REPLCONF VERSION. In practice I guess this is not likely to block in this context, since reading small responses is likely to be received in one packet, so this is just a cleanup (consistent with the previous state machine processing). Also update the state machine diagram to mention the VERSION reply. 
Signed-off-by: Binbin --- src/replication.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/replication.c b/src/replication.c index 48f02cf658..a809c4c166 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3379,15 +3379,15 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │ │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * │ │ │ │ │+CONTINUE │ - * │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * │ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼──────────┐ │ │ │Done loading │ │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │ │ │ └───────┬───────────────┘ │ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ * ┌────────▼───┐ │ │ │ │ │ * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ * └─┬──────────┘ │ │ │buffer into memory │ │ @@ -3589,6 +3589,7 @@ void syncWithPrimary(connection *conn) { sdsfree(err); err = NULL; server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + return; } /* Receive VERSION reply. 
*/ From ee386c92ffa9724771e4980064fa279655e46f90 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 20 Nov 2024 00:17:20 +0800 Subject: [PATCH 42/92] Manual failover vote is not limited by two times the node timeout (#1305) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This limit should not restrict manual failover, otherwise in some scenarios, manual failover will time out. For example, if some FAILOVER_AUTH_REQUESTs or some FAILOVER_AUTH_ACKs are lost during a manual failover, it cannot vote in the second manual failover. Or in a mixed scenario of plain failover and manual failover, it cannot vote for the subsequent manual failover. The problem with the manual failover retry is that the mf will pause the client 5s in the primary side. So every retry every manual failover timed out is a bad move. --------- Signed-off-by: Binbin Co-authored-by: Viktor Söderqvist --- src/cluster_legacy.c | 15 +++-- src/cluster_legacy.h | 3 +- tests/support/cluster_util.tcl | 1 + tests/unit/cluster/manual-failover.tcl | 88 ++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 6 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 69af65f1e8..7b3384ee9f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4363,12 +4363,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We did not voted for a replica about this primary for two * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. */ - if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { + * of the algorithm but makes the base case more linear. + * + * This limitation does not restrict manual failover. If a user initiates + * a manual failover, we need to allow it to vote, otherwise the manual + * failover may time out. 
*/ + if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this primary before %lld milliseconds", + "Failover auth denied to %.40s (%s): " + "can't vote for any replica of %.40s (%s) within %lld milliseconds", node->name, node->human_nodename, + node->replicaof->name, node->replicaof->human_nodename, (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); return; } @@ -4394,7 +4399,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->replicaof->voted_time = mstime(); + if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5280644e6e..2c3e1d83c8 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -338,7 +338,8 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary */ + mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual + * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ long long repl_offset; /* Last known repl offset for this node. 
*/ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 4b399214b9..686f00071b 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue wait_for_condition 1000 50 { [CI $j cluster_state] eq $state } else { diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 2a9dff934b..78842068fa 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -183,3 +183,91 @@ test "Wait for instance #0 to return back alive" { } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Setting a large timeout to make sure we hit the voted_time limit. + R 0 config set cluster-node-timeout 150000 + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # Let replica drop FAILOVER_AUTH_ACK so that the election won't + # get the enough votes and the election will time out. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK + + # The first manual failover will time out. + R 3 cluster failover + wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + + # Undo packet drop, so that replica can win the next election. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Make sure the second manual failover will work. 
+ R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - mixed failover" { + # Make sure the failover is triggered by us. + R 1 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover yes + R 3 config set cluster-replica-validity-factor 0 + + # Pause the primary. + pause_process [srv 0 pid] + wait_for_cluster_state fail + + # Setting a large timeout to make sure we hit the voted_time limit. + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # R 3 performs an automatic failover and it will work. + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [s -3 role] eq {master} + } else { + fail "The first failover does not happen" + } + + # Resume the primary and wait for it to become a replica. + resume_process [srv 0 pid] + wait_for_condition 1000 50 { + [s 0 role] eq {slave} + } else { + fail "Old primary not converted into replica" + } + wait_for_cluster_propagation + + # The old primary doing a manual failover and wait for it. + R 0 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {master} && + [s -3 role] eq {slave} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + + # R 3 performs a manual failover and it will work. 
+ R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The third falover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster From 49863109453faa907ce2c8b1158e60a6777d28ab Mon Sep 17 00:00:00 2001 From: Yanqi Lv Date: Wed, 20 Nov 2024 04:53:19 +0800 Subject: [PATCH 43/92] Import-mode: Avoid expiration and eviction during data syncing (#1185) New config: `import-mode (yes|no)` New command: `CLIENT IMPORT-SOURCE (ON|OFF)` The config, when set to `yes`, disables eviction and deletion of expired keys, except for commands coming from a client which has marked itself as an import-source, the data source when importing data from another node, using the CLIENT IMPORT-SOURCE command. When we sync data from the source Valkey to the destination Valkey using some sync tools like [redis-shake](https://github.com/tair-opensource/RedisShake), the destination Valkey can perform expiration and eviction, which may cause data corruption. This problem has been discussed in https://github.com/redis/redis/discussions/9760#discussioncomment-1681041 and Redis already have a solution. But in Valkey we haven't fixed it by now. E.g. we call `set key 1 ex 1` on the source server and transfer this command to the destination server. Then we call `incr key` on the source server before the key expired, we will have a key on the source server with a value of 2. But when the command arrived at the destination server, the key may be expired and has deleted. So we will have a key on the destination server with a value of 1, which is inconsistent with the source server. In standalone mode, we can use writable replica to simplify the sync process. However, in cluster mode, we still need a sync tool to help us transfer the source data to the destination. The sync tool usually work as a normal client and the destination works as a primary which keep expiration and eviction. 
In this PR, we add a new mode named 'import-mode'. In this mode, server stop expiration and eviction just like a replica. Notice that this mode exists only in sync state to avoid data inconsistency caused by expiration and eviction. Import mode only takes effect on the primary. Sync tools can mark their clients as an import source by `CLIENT IMPORT-SOURCE`, which work like a client from primary and can visit expired keys in `lookupkey`. **Notice: during the migration, other clients, apart from the import source, should not access the data imported by import source.** --------- Signed-off-by: lvyanqi.lyq Signed-off-by: Yanqi Lv Co-authored-by: Madelyn Olson --- src/commands.def | 29 ++++++++++ src/commands/client-import-source.json | 40 ++++++++++++++ src/config.c | 1 + src/db.c | 21 +++++++- src/evict.c | 4 +- src/expire.c | 7 ++- src/networking.c | 20 +++++++ src/server.c | 9 ++-- src/server.h | 5 +- tests/unit/expire.tcl | 74 ++++++++++++++++++++++++++ tests/unit/maxmemory.tcl | 18 +++++++ valkey.conf | 7 +++ 12 files changed, 225 insertions(+), 10 deletions(-) create mode 100644 src/commands/client-import-source.json diff --git a/src/commands.def b/src/commands.def index 791b30d540..ecc77126af 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1230,6 +1230,34 @@ struct COMMAND_ARG CLIENT_CAPA_Args[] = { #define CLIENT_ID_Keyspecs NULL #endif +/********** CLIENT IMPORT_SOURCE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLIENT IMPORT_SOURCE history */ +#define CLIENT_IMPORT_SOURCE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLIENT IMPORT_SOURCE tips */ +#define CLIENT_IMPORT_SOURCE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLIENT IMPORT_SOURCE key specs */ +#define CLIENT_IMPORT_SOURCE_Keyspecs NULL +#endif + +/* CLIENT IMPORT_SOURCE enabled argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_enabled_Subargs[] = { +{MAKE_ARG("on",ARG_TYPE_PURE_TOKEN,-1,"ON",NULL,NULL,CMD_ARG_NONE,0,NULL)}, 
+{MAKE_ARG("off",ARG_TYPE_PURE_TOKEN,-1,"OFF",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* CLIENT IMPORT_SOURCE argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_Args[] = { +{MAKE_ARG("enabled",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=CLIENT_IMPORT_SOURCE_enabled_Subargs}, +}; + /********** CLIENT INFO ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1630,6 +1658,7 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, {MAKE_CMD("info","Returns information about the 
connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,7,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, {MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,7,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, diff --git a/src/commands/client-import-source.json b/src/commands/client-import-source.json new file mode 100644 index 0000000000..113c07d70a --- /dev/null +++ b/src/commands/client-import-source.json @@ -0,0 +1,40 @@ +{ + "IMPORT-SOURCE": { + "summary": "Mark this client as an import source when server is in import mode.", + "complexity": "O(1)", + "group": "connection", + "since": "8.1.0", + "arity": 3, + "container": "CLIENT", + "function": "clientCommand", + "command_flags": [ + "NOSCRIPT", + "LOADING", + "STALE" + ], + "acl_categories": [ + "CONNECTION" + ], + "reply_schema": { + "const": "OK" + }, + "arguments": [ + { + "name": "enabled", + "type": "oneof", + "arguments": [ + { + "name": "on", + "type": "pure-token", + "token": "ON" + }, + { + "name": "off", + "type": "pure-token", + "token": "OFF" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/config.c b/src/config.c index 15fec15276..c4009adefa 100644 --- a/src/config.c +++ b/src/config.c @@ -3139,6 +3139,7 @@ standardConfig 
static_configs[] = { createBoolConfig("enable-debug-assert", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, server.enable_debug_assert, 0, NULL, NULL), createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL), createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), + createBoolConfig("import-mode", NULL, MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), diff --git a/src/db.c b/src/db.c index b59c7727b2..10d4a04091 100644 --- a/src/db.c +++ b/src/db.c @@ -385,7 +385,7 @@ robj *dbRandomKey(serverDb *db) { key = dictGetKey(de); keyobj = createStringObject(key, sdslen(key)); if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) { - if (allvolatile && server.primary_host && --maxtries == 0) { + if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically * expired in the repilca, so the function cannot stop because @@ -1821,6 +1821,25 @@ keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int di if (server.primary_host != NULL) { if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; + } else if (server.import_mode) { + /* If we are running in the import mode on a primary, instead of + * evicting the expired key from the database, we return ASAP: + * the key expiration is controlled by the import source that will + * send us synthesized DEL operations for expired keys. The + * exception is when write operations are performed on this server + * because it's a primary. 
+ * + * Notice: other clients, apart from the import source, should not access + * the data imported by import source. + * + * Still we try to return the right information to the caller, + * that is, KEY_VALID if we think the key should still be valid, + * KEY_EXPIRED if we think the key is expired but don't want to delete it at this time. + * + * When receiving commands from the import source, keys are never considered + * expired. */ + if (server.current_client && (server.current_client->flag.import_source)) return KEY_VALID; + if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } /* In some cases we're explicitly instructed to return an indication of a diff --git a/src/evict.c b/src/evict.c index 5e4b6220eb..5208328b32 100644 --- a/src/evict.c +++ b/src/evict.c @@ -546,8 +546,8 @@ int performEvictions(void) { goto update_metrics; } - if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) { - result = EVICT_FAIL; /* We need to free memory, but policy forbids. */ + if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION || (iAmPrimary() && server.import_mode)) { + result = EVICT_FAIL; /* We need to free memory, but policy forbids or we are in import mode. */ goto update_metrics; } diff --git a/src/expire.c b/src/expire.c index 928bb58d86..c22df1ef86 100644 --- a/src/expire.c +++ b/src/expire.c @@ -520,8 +520,11 @@ int checkAlreadyExpired(long long when) { * of a replica instance. * * Instead we add the already expired key to the database with expire time - * (possibly in the past) and wait for an explicit DEL from the primary. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host); + * (possibly in the past) and wait for an explicit DEL from the primary. + * + * If the server is a primary and in the import mode, we also add the already + * expired key and wait for an explicit DEL from the import source. 
*/ + return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/networking.c b/src/networking.c index 4791055b5a..9558780f39 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3585,6 +3585,10 @@ void clientCommand(client *c) { " Protect current client connection from eviction.", "NO-TOUCH (ON|OFF)", " Will not touch LRU/LFU stats when this mode is on.", + "IMPORT-SOURCE (ON|OFF)", + " Mark this connection as an import source if server.import_mode is true.", + " Sync tools can set their connections into 'import-source' state to visit", + " expired keys.", NULL}; addReplyHelp(c, help); } else if (!strcasecmp(c->argv[1]->ptr, "id") && c->argc == 2) { @@ -4058,6 +4062,22 @@ void clientCommand(client *c) { } } addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "import-source")) { + /* CLIENT IMPORT-SOURCE ON|OFF */ + if (!server.import_mode) { + addReplyError(c, "Server is not in import mode"); + return; + } + if (!strcasecmp(c->argv[2]->ptr, "on")) { + c->flag.import_source = 1; + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { + c->flag.import_source = 0; + addReply(c, shared.ok); + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } } else { addReplySubcommandSyntaxError(c); } diff --git a/src/server.c b/src/server.c index 12691df8ee..aebbb57a93 100644 --- a/src/server.c +++ b/src/server.c @@ -1131,10 +1131,10 @@ void databasesCron(void) { /* Expire keys by random sampling. Not required for replicas * as primary will synthesize DELs for us. 
*/ if (server.active_expire_enabled) { - if (iAmPrimary()) { - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); - } else { + if (!iAmPrimary()) { expireReplicaKeys(); + } else if (!server.import_mode) { + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); } } @@ -1727,7 +1727,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Run a fast expire cycle (the called function will return * ASAP if a fast cycle is not needed). */ - if (server.active_expire_enabled && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); + if (server.active_expire_enabled && !server.import_mode && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); if (moduleCount()) { moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_BEFORE_SLEEP, NULL); @@ -2133,6 +2133,7 @@ void initServerConfig(void) { server.extended_redis_compat = 0; server.pause_cron = 0; server.dict_resizing = 1; + server.import_mode = 0; server.latency_tracking_info_percentiles_len = 3; server.latency_tracking_info_percentiles = zmalloc(sizeof(double) * (server.latency_tracking_info_percentiles_len)); diff --git a/src/server.h b/src/server.h index 5ef04a9080..531ca8e7c8 100644 --- a/src/server.h +++ b/src/server.h @@ -1233,7 +1233,8 @@ typedef struct ClientFlags { * knows that it does not need the cache and required a full sync. With this * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ - uint64_t reserved : 5; /* Reserved for future use */ + uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ + uint64_t reserved : 4; /* Reserved for future use */ } ClientFlags; typedef struct client { @@ -2089,6 +2090,8 @@ struct valkeyServer { char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ long long primary_initial_offset; /* Primary PSYNC offset. */ int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? 
*/ + /* Import Mode */ + int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ list *clients_waiting_acks; /* Clients waiting in WAIT or WAITAOF. */ int get_ack_from_replicas; /* If true we send REPLCONF GETACK. */ diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index d85ce7ee68..fba425f62d 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -832,6 +832,80 @@ start_server {tags {"expire"}} { close_replication_stream $repl assert_equal [r debug set-active-expire 1] {OK} } {} {needs:debug} + + test {Import mode should forbid active expiration} { + r flushall + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 bar PX 1 + r set foo2 bar PX 1 + after 100 + + assert_equal [r dbsize] {2} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." 
+ } + } + + test {Import mode should forbid lazy expiration} { + r flushall + r debug set-active-expire 0 + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 1 PX 1 + after 10 + + r get foo1 + assert_equal [r dbsize] {1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + r get foo1 + + assert_equal [r dbsize] {0} + + assert_equal [r debug set-active-expire 1] {OK} + } {} {needs:debug} + + test {RANDOMKEY can return expired key in import mode} { + r flushall + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 bar PX 1 + after 10 + + set client [valkey [srv "host"] [srv "port"] 0 $::tls] + if {!$::singledb} { + $client select 9 + } + assert_equal [$client ttl foo1] {-2} + + assert_equal [r randomkey] {foo1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." 
+ } + } } start_cluster 1 0 {tags {"expire external:skip cluster"}} { diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index d4e62246f1..89e9699a3e 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -611,3 +611,21 @@ start_server {tags {"maxmemory" "external:skip"}} { assert {[r object freq foo] == 5} } } + +start_server {tags {"maxmemory" "external:skip"}} { + test {Import mode should forbid eviction} { + r set key val + r config set import-mode yes + assert_equal [r client import-source on] {OK} + r config set maxmemory-policy allkeys-lru + r config set maxmemory 1 + + assert_equal [r dbsize] {1} + assert_error {OOM command not allowed*} {r set key1 val1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + assert_equal [r dbsize] {0} + } +} \ No newline at end of file diff --git a/valkey.conf b/valkey.conf index 7c7b9da43e..bf82b01874 100644 --- a/valkey.conf +++ b/valkey.conf @@ -818,6 +818,13 @@ replica-priority 100 # # replica-ignore-disk-write-errors no +# Make the primary forbid expiration and eviction. +# This is useful for sync tools, because expiration and eviction may cause the data corruption. +# Sync tools can mark their connections as importing source by CLIENT IMPORT-SOURCE. +# NOTICE: Clients should avoid writing the same key on the source server and the destination server. +# +# import-mode no + # ----------------------------------------------------------------------------- # By default, Sentinel includes all replicas in its reports. A replica # can be excluded from Sentinel's announcements. An unannounced replica From f553ccbda674caa13d3cfa6e8096c5f19cb3a9c1 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 21 Nov 2024 20:01:30 +0800 Subject: [PATCH 44/92] Use goto to cleanup error handling in readSyncBulkPayload (#1332) The goto error label is the same as the error return, use goto to reduce the references. 
``` error: cancelReplicationHandshake(1); return; ``` Also this can make the log printing more continuous under the error, that is, we print the error log first, and then print the reconnecting log at the last (in cancelReplicationHandshake). Signed-off-by: Binbin --- src/replication.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/replication.c b/src/replication.c index a809c4c166..75f08c4c89 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2091,8 +2091,7 @@ void readSyncBulkPayload(connection *conn) { } serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", (nread == -1) ? connGetLastError(conn) : "connection lost"); - cancelReplicationHandshake(1); - return; + goto error; } server.stat_net_repl_input_bytes += nread; @@ -2257,7 +2256,6 @@ void readSyncBulkPayload(connection *conn) { if (loadingFailed) { stopLoading(0); - cancelReplicationHandshake(1); rioFreeConn(&rdb, NULL); if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { @@ -2277,7 +2275,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on SYNC * failure, it'll be restarted when sync succeeds or the replica * gets promoted. */ - return; + goto error; } /* RDB loading succeeded if we reach this point. */ @@ -2319,8 +2317,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync the temp DB to disk in " "PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* Rename rdb like renaming rewrite aof asynchronously. */ @@ -2330,9 +2327,8 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to rename the temp DB into %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); if (old_rdb_fd != -1) close(old_rdb_fd); - return; + goto error; } /* Close old rdb asynchronously. 
*/ if (old_rdb_fd != -1) bioCreateCloseJob(old_rdb_fd, 0, 0); @@ -2343,8 +2339,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync DB directory %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* We will soon start loading the RDB from disk, the replication history is changed, @@ -2361,7 +2356,6 @@ void readSyncBulkPayload(connection *conn) { if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " "DB from disk, check server logs."); - cancelReplicationHandshake(1); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2375,7 +2369,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on sync failure, it'll be restarted when sync succeeds or replica promoted. */ - return; + goto error; } /* Cleanup. */ From 6038eda010dfb99eff908cf0839cc41004383acd Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 21 Nov 2024 21:02:05 +0800 Subject: [PATCH 45/92] Make FUNCTION RESTORE FLUSH flush async based on lazyfree-lazy-user-flush (#1254) FUNCTION RESTORE have a FLUSH option, it will delete all the existing libraries before restoring the payload. If for some reasons, there are a lot of libraries, we will block a while in here. 
Signed-off-by: Binbin --- src/functions.c | 17 +++++++++++++---- src/functions.h | 4 ++-- src/replication.c | 2 +- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/functions.c b/src/functions.c index c9ec42b322..916d8fd622 100644 --- a/src/functions.c +++ b/src/functions.c @@ -185,6 +185,15 @@ void functionsLibCtxClearCurrent(int async) { } } +/* Free the given functions ctx */ +static void functionsLibCtxFreeGeneric(functionsLibCtx *functions_lib_ctx, int async) { + if (async) { + freeFunctionsAsync(functions_lib_ctx); + } else { + functionsLibCtxFree(functions_lib_ctx); + } +} + /* Free the given functions ctx */ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { functionsLibCtxClear(functions_lib_ctx); @@ -196,8 +205,8 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { /* Swap the current functions ctx with the given one. * Free the old functions ctx. */ -void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx) { - functionsLibCtxFree(curr_functions_lib_ctx); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async) { + functionsLibCtxFreeGeneric(curr_functions_lib_ctx, async); curr_functions_lib_ctx = new_lib_ctx; } @@ -769,7 +778,7 @@ void functionRestoreCommand(client *c) { } if (restore_replicy == restorePolicy_Flush) { - functionsLibCtxSwapWithCurrent(functions_lib_ctx); + functionsLibCtxSwapWithCurrent(functions_lib_ctx, server.lazyfree_lazy_user_flush); functions_lib_ctx = NULL; /* avoid releasing the f_ctx in the end */ } else { if (libraryJoin(curr_functions_lib_ctx, functions_lib_ctx, restore_replicy == restorePolicy_Replace, &err) != @@ -789,7 +798,7 @@ void functionRestoreCommand(client *c) { addReply(c, shared.ok); } if (functions_lib_ctx) { - functionsLibCtxFree(functions_lib_ctx); + functionsLibCtxFreeGeneric(functions_lib_ctx, server.lazyfree_lazy_user_flush); } } diff --git a/src/functions.h b/src/functions.h index da196cf197..429405bb2d 100644 --- a/src/functions.h +++ 
b/src/functions.h @@ -134,9 +134,9 @@ size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx); functionsLibCtx *functionsLibCtxGetCurrent(void); functionsLibCtx *functionsLibCtxCreate(void); void functionsLibCtxClearCurrent(int async); -void functionsLibCtxFree(functionsLibCtx *lib_ctx); +void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); void functionsLibCtxClear(functionsLibCtx *lib_ctx); -void functionsLibCtxSwapWithCurrent(functionsLibCtx *lib_ctx); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); diff --git a/src/replication.c b/src/replication.c index 75f08c4c89..437ae278ec 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2289,7 +2289,7 @@ void readSyncBulkPayload(connection *conn) { swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ - functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx); + functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx, 0); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_COMPLETED, NULL); From b486a415009660f355d0a8eb9fd67a9c9cb9cc6e Mon Sep 17 00:00:00 2001 From: xbasel <103044017+xbasel@users.noreply.github.com> Date: Thu, 21 Nov 2024 18:22:16 +0200 Subject: [PATCH 46/92] Preserve original fd blocking state in TLS I/O operations (#1298) This change prevents unintended side effects on connection state and improves consistency with non-TLS sync operations. For example, when invoking `connTLSSyncRead` with a blocking file descriptor, the mode is switched to non-blocking upon `connTLSSyncRead` exit. If the code assumes the file descriptor remains blocking and calls the normal `read` expecting it to block, it may result in a short read. 
This caused a crash in dual-channel, which was fixed in this PR by relocating `connBlock()`: https://github.com/valkey-io/valkey/pull/837 Signed-off-by: xbasel <103044017+xbasel@users.noreply.github.com> --- src/anet.c | 30 ++++++++++++++++++++++++++---- src/anet.h | 1 + src/tls.c | 21 ++++++++++++++++----- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/anet.c b/src/anet.c index d4ac698982..8dc06ca62e 100644 --- a/src/anet.c +++ b/src/anet.c @@ -70,17 +70,24 @@ int anetGetError(int fd) { return sockerr; } -int anetSetBlock(char *err, int fd, int non_block) { +static int anetGetSocketFlags(char *err, int fd) { int flags; - /* Set the socket blocking (if non_block is zero) or non-blocking. - * Note that fcntl(2) for F_GETFL and F_SETFL can't be - * interrupted by a signal. */ if ((flags = fcntl(fd, F_GETFL)) == -1) { anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno)); return ANET_ERR; } + return flags; +} + +int anetSetBlock(char *err, int fd, int non_block) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + /* Check if this flag has been set or unset, if so, * then there is no need to call fcntl to set/unset it again. */ if (!!(flags & O_NONBLOCK) == !!non_block) return ANET_OK; @@ -105,6 +112,21 @@ int anetBlock(char *err, int fd) { return anetSetBlock(err, fd, 0); } +int anetIsBlock(char *err, int fd) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + + /* Check if the O_NONBLOCK flag is set */ + if (flags & O_NONBLOCK) { + return 0; /* Socket is non-blocking */ + } else { + return 1; /* Socket is blocking */ + } +} + /* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. * This function should be invoked for fd's on specific places * where fork + execve system calls are called. 
*/ diff --git a/src/anet.h b/src/anet.h index ab32f72e4b..b14b4bdaad 100644 --- a/src/anet.h +++ b/src/anet.h @@ -61,6 +61,7 @@ int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port) int anetUnixAccept(char *err, int serversock); int anetNonBlock(char *err, int fd); int anetBlock(char *err, int fd); +int anetIsBlock(char *err, int fd); int anetCloexec(int fd); int anetEnableTcpNoDelay(char *err, int fd); int anetDisableTcpNoDelay(char *err, int fd); diff --git a/src/tls.c b/src/tls.c index a1fda2a7ae..d1dd567354 100644 --- a/src/tls.c +++ b/src/tls.c @@ -974,6 +974,10 @@ static int connTLSSetReadHandler(connection *conn, ConnectionCallbackFunc func) return C_OK; } +static int isBlocking(tls_connection *conn) { + return anetIsBlock(NULL, conn->c.fd); +} + static void setBlockingTimeout(tls_connection *conn, long long timeout) { anetBlock(NULL, conn->c.fd); anetSendTimeout(NULL, conn->c.fd, timeout); @@ -1012,27 +1016,31 @@ static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, static ssize_t connTLSSyncWrite(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); SSL_clear_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); ERR_clear_error(); int ret = SSL_write(conn->ssl, ptr, size); ret = updateStateAfterSSLIO(conn, ret, 0); SSL_set_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return ret; } static ssize_t connTLSSyncRead(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); ERR_clear_error(); int ret = SSL_read(conn->ssl, ptr, size); updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); - unsetBlockingTimeout(conn); + if (!blocking) { + 
unsetBlockingTimeout(conn); + } return ret; } @@ -1041,6 +1049,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l tls_connection *conn = (tls_connection *)conn_; ssize_t nread = 0; + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); size--; @@ -1067,7 +1076,9 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l size--; } exit: - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return nread; } From b56eed2479191dfd1f644768b7144c35a75ef52c Mon Sep 17 00:00:00 2001 From: zvi-code <54795925+zvi-code@users.noreply.github.com> Date: Fri, 22 Nov 2024 02:29:21 +0200 Subject: [PATCH 47/92] Remove valkey specific changes in jemalloc source code (#1266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary of the change This is a base PR for refactoring defrag. It moves the defrag logic to rely on jemalloc [native api](https://github.com/jemalloc/jemalloc/pull/1463#issuecomment-479706489) instead of relying on custom code changes made by valkey in the jemalloc ([je_defrag_hint](https://github.com/valkey-io/valkey/blob/9f8185f5c80bc98bdbc631b90ccf13929d6a0cbc/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h#L382)) library. This enables valkey to use latest vanila jemalloc without the need to maintain code changes cross jemalloc versions. This change requires some modifications because the new api is providing only the information, not a yes\no defrag. The logic needs to be implemented at valkey code. Additionally, the api does not provide, within single call, all the information needed to make a decision, this information is available through additional api call. To reduce the calls to jemalloc, in this PR the required information is collected during the `computeDefragCycles` and not for every single ptr, this way we are avoiding the additional api call. 
Followup work will utilize the new options that are now open and will further improve the defrag decision and process. ### Added files: `allocator_defrag.c` / `allocator_defrag.h` - This files implement the allocator specific knowledge for making defrag decision. The knowledge about slabs and allocation logic and so on, all goes into this file. This improves the separation between jemalloc specific code and other possible implementation. ### Moved functions: [`zmalloc_no_tcache` , `zfree_no_tcache` ](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/src/zmalloc.c#L215) - these are very jemalloc specific logic assumptions, and are very specific to how we defrag with jemalloc. This is also with the vision that from performance perspective we should consider using tcache, we only need to make sure we don't recycle entries without going through the arena [for example: we can use private tcache, one for free and one for alloc]. `frag_smallbins_bytes` - the logic and implementation moved to the new file ### Existing API: * [once a second + when completed full cycle] [`computeDefragCycles`](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/src/defrag.c#L916) * `zmalloc_get_allocator_info` : gets from jemalloc _allocated, active, resident, retained, muzzy_, `frag_smallbins_bytes` * [`frag_smallbins_bytes`](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/src/zmalloc.c#L690) : for each bin; gets from jemalloc bin_info, `curr_regs`, `cur_slabs` * [during defrag, for each pointer] * `je_defrag_hint` is getting a memory pointer and returns {0,1} . 
[Internally it uses](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h#L368) this information points: * #`nonfull_slabs` * #`total_slabs` * #free regs in the ptr slab ## Jemalloc API (via ctl interface) [BATCH][`experimental_utilization_batch_query_ctl`](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/deps/jemalloc/src/ctl.c#L4114) : gets an array of pointers, returns for each pointer 3 values, * number of free regions in the extent * number of regions in the extent * size of the extent in terms of bytes [EXTENDED][`experimental_utilization_query_ctl`](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/deps/jemalloc/src/ctl.c#L3989) : * memory address of the extent a potential reallocation would go into * number of free regions in the extent * number of regions in the extent * size of the extent in terms of bytes * [stats-enabled]total number of free regions in the bin the extent belongs to * [stats-enabled]total number of regions in the bin the extent belongs to ### `experimental_utilization_batch_query_ctl` vs valkey `je_defrag_hint`? [good] - We can query pointers in a batch, reduce the overall overhead - The per ptr decision algorithm is not within jemalloc api, jemalloc only provides information, valkey can tune\configure\optimize easily [bad] - In the batch API we only know the utilization of the slab (of that memory ptr), we don’t get the data about #`nonfull_slabs` and total allocated regs. ## New functions: 1. `defrag_jemalloc_init`: Reducing the cost of call to je_ctl: use the [MIB interface](https://jemalloc.net/jemalloc.3.html) to get a faster calls. 
See this quote from the jemalloc documentation: The mallctlnametomib() function provides a way to avoid repeated name lookups for applications that repeatedly query the same portion of the namespace,by translating a name to a “Management Information Base” (MIB) that can be passed repeatedly to mallctlbymib(). 6. `jemalloc_sz2binind_lgq*` : this api is to support reverse map between bin size and it’s info without lookup. This mapping depends on the number of size classes we have that are derived from [`lg_quantum`](https://github.com/valkey-io/valkey/blob/4593dc2f059661e1c4eb43bba025f68948344228/deps/Makefile#L115) 7. `defrag_jemalloc_get_frag_smallbins` : This function replaces `frag_smallbins_bytes` the logic moved to the new file allocator_defrag `defrag_jemalloc_should_defrag_multi` → `handle_results` - unpacks the results 8. `should_defrag` : implements the same logic as the existing implementation [inside](https://github.com/valkey-io/valkey/blob/9f8185f5c80bc98bdbc631b90ccf13929d6a0cbc/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h#L382) je_defrag_hint 9. `defrag_jemalloc_should_defrag_multi` : implements the hint for an array of pointers, utilizing the new batch api. currently only 1 pointer is passed. ### Logical differences: In order to get the information about #`nonfull_slabs` and #`regs`, we use the query cycle to collect the information per size class. In order to find the index of bin information given bin size, in o(1), we use `jemalloc_sz2binind_lgq*` . ## Testing This is the first draft. I did some initial testing that basically fragmentation by reducing max memory and than waiting for defrag to reach desired level. The test only serves as sanity that defrag is succeeding eventually, no data provided here regarding efficiency and performance. ### Test: 1. disable `activedefrag` 2. run valkey benchmark on overlapping address ranges with different block sizes 3. wait untill `used_memory` reaches 10GB 4. 
set `maxmemory` to 5GB and `maxmemory-policy` to `allkeys-lru` 5. stop load 6. wait for `mem_fragmentation_ratio` to reach 2 7. enable `activedefrag` - start test timer 8. wait until reach `mem_fragmentation_ratio` = 1.1 #### Results*: (With this PR)Test results: ` 56 sec` (Without this PR)Test results: `67 sec` *both runs perform same "work" number of buffers moved to reach fragmentation target Next benchmarking is to compare to: - DONE // existing `je_get_defrag_hint` - compare with naive defrag all: `int defrag_hint() {return 1;}` --------- Signed-off-by: Zvi Schneider Signed-off-by: Zvi Schneider Signed-off-by: zvi-code <54795925+zvi-code@users.noreply.github.com> Co-authored-by: Zvi Schneider Co-authored-by: Zvi Schneider Co-authored-by: Madelyn Olson --- cmake/Modules/SourceFiles.cmake | 1 + .../internal/jemalloc_internal_inlines_c.h | 51 --- .../include/jemalloc/jemalloc_macros.h.in | 4 - deps/jemalloc/src/jemalloc.c | 9 - src/Makefile | 2 +- src/allocator_defrag.c | 426 ++++++++++++++++++ src/allocator_defrag.h | 22 + src/defrag.c | 14 +- src/server.c | 9 +- src/server.h | 1 + src/zmalloc.c | 79 +--- src/zmalloc.h | 19 +- 12 files changed, 466 insertions(+), 171 deletions(-) create mode 100644 src/allocator_defrag.c create mode 100644 src/allocator_defrag.h diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index d76f17625e..873229d6f0 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -74,6 +74,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/geohash.c ${CMAKE_SOURCE_DIR}/src/geohash_helper.c ${CMAKE_SOURCE_DIR}/src/childinfo.c + ${CMAKE_SOURCE_DIR}/src/allocator_defrag.c ${CMAKE_SOURCE_DIR}/src/defrag.c ${CMAKE_SOURCE_DIR}/src/siphash.c ${CMAKE_SOURCE_DIR}/src/rax.c diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 2cd7e7ce93..b0868b7d61 100644 --- 
a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -337,55 +337,4 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) { return fallback_alloc(size); } -JEMALLOC_ALWAYS_INLINE int -iget_defrag_hint(tsdn_t *tsdn, void* ptr) { - int defrag = 0; - emap_alloc_ctx_t alloc_ctx; - emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); - if (likely(alloc_ctx.slab)) { - /* Small allocation. */ - edata_t *slab = emap_edata_lookup(tsdn, &arena_emap_global, ptr); - arena_t *arena = arena_get_from_edata(slab); - szind_t binind = edata_szind_get(slab); - unsigned binshard = edata_binshard_get(slab); - bin_t *bin = arena_get_bin(arena, binind, binshard); - malloc_mutex_lock(tsdn, &bin->lock); - arena_dalloc_bin_locked_info_t info; - arena_dalloc_bin_locked_begin(&info, binind); - /* Don't bother moving allocations from the slab currently used for new allocations */ - if (slab != bin->slabcur) { - int free_in_slab = edata_nfree_get(slab); - if (free_in_slab) { - const bin_info_t *bin_info = &bin_infos[binind]; - /* Find number of non-full slabs and the number of regs in them */ - unsigned long curslabs = 0; - size_t curregs = 0; - /* Run on all bin shards (usually just one) */ - for (uint32_t i=0; i< bin_info->n_shards; i++) { - bin_t *bb = arena_get_bin(arena, binind, i); - curslabs += bb->stats.nonfull_slabs; - /* Deduct the regs in full slabs (they're not part of the game) */ - unsigned long full_slabs = bb->stats.curslabs - bb->stats.nonfull_slabs; - curregs += bb->stats.curregs - full_slabs * bin_info->nregs; - if (bb->slabcur) { - /* Remove slabcur from the overall utilization (not a candidate to nove from) */ - curregs -= bin_info->nregs - edata_nfree_get(bb->slabcur); - curslabs -= 1; - } - } - /* Compare the utilization ratio of the slab in question to the total average - * among non-full slabs. 
To avoid precision loss in division, we do that by - * extrapolating the usage of the slab as if all slabs have the same usage. - * If this slab is less used than the average, we'll prefer to move the data - * to hopefully more used ones. To avoid stagnation when all slabs have the same - * utilization, we give additional 12.5% weight to the decision to defrag. */ - defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs + curregs / 8; - } - } - arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); - malloc_mutex_unlock(tsdn, &bin->lock); - } - return defrag; -} - #endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in index d04af34d93..ebb3137e6f 100644 --- a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in +++ b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in @@ -147,7 +147,3 @@ #else # define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW #endif - -/* This version of Jemalloc, modified for Redis, has the je_get_defrag_hint() - * function. */ -#define JEMALLOC_FRAG_HINT diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c index 83026093be..ea9232c5d6 100644 --- a/deps/jemalloc/src/jemalloc.c +++ b/deps/jemalloc/src/jemalloc.c @@ -4474,12 +4474,3 @@ jemalloc_postfork_child(void) { } /******************************************************************************/ - -/* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation. - * returns 1 if the allocation should be moved, and 0 if the allocation be kept. - * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. 
*/ -JEMALLOC_EXPORT int JEMALLOC_NOTHROW -get_defrag_hint(void* ptr) { - assert(ptr != NULL); - return iget_defrag_hint(TSDN_NULL, ptr); -} diff --git a/src/Makefile b/src/Makefile index a76356e9d5..f876f55dec 100644 --- a/src/Makefile +++ b/src/Makefile @@ -411,7 +411,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o 
notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c new file mode 100644 index 0000000000..b2330c95e0 --- /dev/null +++ b/src/allocator_defrag.c @@ -0,0 +1,426 @@ +/* Copyright 2024- Valkey contributors + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/* + * This file implements allocator-specific defragmentation logic used + * within the Valkey engine. Below is the relationship between various + * components involved in allocation and defragmentation: + * + * Application code + * / \ + * allocation / \ defrag + * / \ + * zmalloc allocator_defrag + * / | \ / \ + * / | \ / \ + * / | \ / \ + * libc tcmalloc jemalloc other + * + * Explanation: + * - **Application code**: High-level application logic that uses memory + * allocation and may trigger defragmentation. + * - **zmalloc**: An abstraction layer over the memory allocator, providing + * a uniform allocation interface to the application code. It can delegate + * to various underlying allocators (e.g., libc, tcmalloc, jemalloc, or others). + * It is not dependant on defrag implementation logic and it's possible to use jemalloc + * version that does not support defrag. 
+ * - **allocator_defrag**: This file contains allocator-specific logic for + * defragmentation, invoked from `defrag.c` when memory defragmentation is needed. + * currently jemalloc is the only allocator with implemented defrag logic. It is possible that + * future implementation will include non-allocator defragmentation (think of data-structure + * compaction for example). + * - **Underlying allocators**: These are the actual memory allocators, such as + * libc, tcmalloc, jemalloc, or other custom allocators. The defragmentation + * logic in `allocator_defrag` interacts with these allocators to reorganize + * memory and reduce fragmentation. + * + * The `defrag.c` file acts as the central entry point for defragmentation, + * invoking allocator-specific implementations provided here in `allocator_defrag.c`. + * + * Note: Developers working on `zmalloc` or `allocator_defrag` should refer to + * the other component to ensure both are using the same allocator configuration. + */ + +#include +#include "serverassert.h" +#include "allocator_defrag.h" + +#define UNUSED(x) (void)(x) + +#if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC) + +#define STRINGIFY_(x) #x +#define STRINGIFY(x) STRINGIFY_(x) + +#define BATCH_QUERY_ARGS_OUT 3 +#define SLAB_NFREE(out, i) out[(i) * BATCH_QUERY_ARGS_OUT] +#define SLAB_LEN(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 2] +#define SLAB_NUM_REGS(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 1] + +#define UTILIZATION_THRESHOLD_FACTOR_MILI (125) // 12.5% additional utilization + +/* + * Represents a precomputed key for querying jemalloc statistics. + * + * The `jeMallctlKey` structure stores a key corresponding to a specific jemalloc + * statistics field name. This key is used with the `je_mallctlbymib` interface + * to query statistics more efficiently, bypassing the need for runtime string + * lookup and translation performed by `je_mallctl`. 
+ * + * - `je_mallctlnametomib` is called once for each statistics field to precompute + * and store the key corresponding to the field name. + * - Subsequent queries use `je_mallctlbymib` with the stored key, avoiding the + * overhead of repeated string-based lookups. + * + */ +typedef struct jeMallctlKey { + size_t key[6]; /* The precomputed key used to query jemalloc statistics. */ + size_t keylen; /* The length of the key array. */ +} jeMallctlKey; + +/* Stores MIB (Management Information Base) keys for jemalloc bin queries. + * + * This struct holds precomputed `jeMallctlKey` values for querying various + * jemalloc bin-related statistics efficiently. + */ +typedef struct jeBinInfoKeys { + jeMallctlKey curr_slabs; /* Key to query the current number of slabs in the bin. */ + jeMallctlKey nonfull_slabs; /* Key to query the number of non-full slabs in the bin. */ + jeMallctlKey curr_regs; /* Key to query the current number of regions in the bin. */ +} jeBinInfoKeys; + +/* Represents detailed information about a jemalloc bin. + * + * This struct provides metadata about a jemalloc bin, including the size of + * its regions, total number of regions, and related MIB keys for efficient + * queries. + */ +typedef struct jeBinInfo { + size_t reg_size; /* Size of each region in the bin. */ + uint32_t nregs; /* Total number of regions in the bin. */ + jeBinInfoKeys info_keys; /* Precomputed MIB keys for querying bin statistics. */ +} jeBinInfo; + +/* Represents the configuration for jemalloc bins. + * + * This struct contains information about the number of bins and metadata for + * each bin, as well as precomputed keys for batch utility queries and epoch updates. + */ +typedef struct jemallocCB { + unsigned nbins; /* Number of bins in the jemalloc configuration. */ + jeBinInfo *bin_info; /* Array of `jeBinInfo` structs, one for each bin. */ + jeMallctlKey util_batch_query; /* Key to query batch utilization information. 
*/ + jeMallctlKey epoch; /* Key to trigger statistics sync between threads. */ +} jemallocCB; + +/* Represents the latest usage statistics for a jemalloc bin. + * + * This struct tracks the current usage of a bin, including the number of slabs + * and regions, and calculates the number of full slabs from other fields. + */ +typedef struct jemallocBinUsageData { + size_t curr_slabs; /* Current number of slabs in the bin. */ + size_t curr_nonfull_slabs; /* Current number of non-full slabs in the bin. */ + size_t curr_regs; /* Current number of regions in the bin. */ +} jemallocBinUsageData; + + +static int defrag_supported = 0; +/* Control block holding information about bins and query helper - + * this structure is initialized once when calling allocatorDefragInit. It does not change afterwards*/ +static jemallocCB je_cb = {0, NULL, {{0}, 0}, {{0}, 0}}; +/* Holds the latest usage statistics for each bin. This structure is updated when calling + * allocatorDefragGetFragSmallbins and later is used to make a defrag decision for a memory pointer. */ +static jemallocBinUsageData *je_usage_info = NULL; + + +/* ----------------------------------------------------------------------------- + * Alloc/Free API that are cooperative with defrag + * -------------------------------------------------------------------------- */ + +/* Allocation and free functions that bypass the thread cache + * and go straight to the allocator arena bins. + * Currently implemented only for jemalloc. Used for online defragmentation. 
+ */ +void *allocatorDefragAlloc(size_t size) { + void *ptr = je_mallocx(size, MALLOCX_TCACHE_NONE); + return ptr; +} +void allocatorDefragFree(void *ptr, size_t size) { + if (ptr == NULL) return; + je_sdallocx(ptr, size, MALLOCX_TCACHE_NONE); +} + +/* ----------------------------------------------------------------------------- + * Helper functions for jemalloc translation between size and index + * -------------------------------------------------------------------------- */ + +/* Get the bin index in bin array from the reg_size. + * + * these are reverse engineered mapping of reg_size -> binind. We need this information because the utilization query + * returns the size of the buffer and not the bin index, and we need the bin index to access it's usage information + * + * Note: In case future PR will return the binind (that is better API anyway) we can get rid of + * these conversion functions + */ +static inline unsigned jeSize2BinIndexLgQ3(size_t sz) { + /* Smallest power-of-2 quantum for binning */ + const size_t size_class_group_size = 4; + /* Number of bins in each power-of-2 size class group */ + const size_t lg_quantum_3_first_pow2 = 3; + /* Offset for exponential bins */ + const size_t lg_quantum_3_offset = ((64 >> lg_quantum_3_first_pow2) - 1); + /* Small sizes (8-64 bytes) use linear binning */ + if (sz <= 64) { // 64 = 1 << (lg_quantum_3_first_pow2 + 3) + return (sz >> 3) - 1; // Divide by 8 and subtract 1 + } + + /* For larger sizes, use exponential binning */ + + /* Calculate leading zeros of (sz - 1) to properly handle power-of-2 sizes */ + unsigned leading_zeros = __builtin_clzll(sz - 1); + unsigned exp = 64 - leading_zeros; // Effective log2(sz) + + /* Calculate the size's position within its group */ + unsigned within_group_offset = size_class_group_size - + (((1ULL << exp) - sz) >> (exp - lg_quantum_3_first_pow2)); + + /* Calculate the final bin index */ + return within_group_offset + + ((exp - (lg_quantum_3_first_pow2 + 3)) - 1) * 
size_class_group_size + + lg_quantum_3_offset; +} +/* ----------------------------------------------------------------------------- + * Interface functions to get fragmentation info from jemalloc + * -------------------------------------------------------------------------- */ +#define ARENA_TO_QUERY MALLCTL_ARENAS_ALL + +static inline void jeRefreshStats(const jemallocCB *je_cb) { + uint64_t epoch = 1; // Value doesn't matter + size_t sz = sizeof(epoch); + /* Refresh stats */ + je_mallctlbymib(je_cb->epoch.key, je_cb->epoch.keylen, &epoch, &sz, &epoch, sz); +} + +/* Extract key that corresponds to the given name for fast query. This should be called once for each key_name */ +static inline int jeQueryKeyInit(const char *key_name, jeMallctlKey *key_info) { + key_info->keylen = sizeof(key_info->key) / sizeof(key_info->key[0]); + int res = je_mallctlnametomib(key_name, key_info->key, &key_info->keylen); + /* sanity check that returned value is not larger than provided */ + assert(key_info->keylen <= sizeof(key_info->key) / sizeof(key_info->key[0])); + return res; +} + +/* Query jemalloc control interface using previously extracted key (with jeQueryKeyInit) instead of name string. + * This interface (named MIB in jemalloc) is faster as it avoids string dict lookup at run-time. */ +static inline int jeQueryCtlInterface(const jeMallctlKey *key_info, void *value) { + size_t sz = sizeof(size_t); + return je_mallctlbymib(key_info->key, key_info->keylen, value, &sz, NULL, 0); +} + +static inline int binQueryHelperInitialization(jeBinInfoKeys *helper, unsigned bin_index) { + char mallctl_name[128]; + + /* Mib of fetch number of used regions in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curregs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_regs) != 0) return -1; + /* Mib of fetch number of current slabs in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." 
STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curslabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_slabs) != 0) return -1; + /* Mib of fetch nonfull slabs */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.nonfull_slabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->nonfull_slabs) != 0) return -1; + + return 0; +} + +/* Initializes the defragmentation system for the jemalloc memory allocator. + * + * This function performs the necessary setup and initialization steps for the defragmentation system. + * It retrieves the configuration information for the jemalloc arenas and bins, and initializes the usage + * statistics data structure. + * + * return 0 on success, or a non-zero error code on failure. + * + * The initialization process involves the following steps: + * 1. Check if defragmentation is supported by the current jemalloc version. + * 2. Retrieve the arena bin configuration information using the `je_mallctlbymib` function. + * 3. Initialize the `usage_latest` structure with the bin usage statistics and configuration data. + * 4. Set the `defrag_supported` flag to indicate that defragmentation is enabled. + * + * Note: This function must be called before using any other defragmentation-related functionality. + * It should be called during the initialization phase of the code that uses the + * defragmentation feature. 
+ */ +int allocatorDefragInit(void) { + char mallctl_name[100]; + jeBinInfo *bin_info; + size_t sz; + int je_res; + + /* the init should be called only once, fail if unexpected call */ + assert(!defrag_supported); + + /* Get the mib of the per memory pointers query command that is used during defrag scan over memory */ + if (jeQueryKeyInit("experimental.utilization.batch_query", &je_cb.util_batch_query) != 0) return -1; + + je_res = jeQueryKeyInit("epoch", &je_cb.epoch); + assert(je_res == 0); + jeRefreshStats(&je_cb); + + /* get quantum for verification only, current code assumes lg-quantum should be 3 */ + size_t jemalloc_quantum; + sz = sizeof(jemalloc_quantum); + je_mallctl("arenas.quantum", &jemalloc_quantum, &sz, NULL, 0); + /* lg-quantum should be 3 so jemalloc_quantum should be 1<<3 */ + assert(jemalloc_quantum == 8); + + sz = sizeof(je_cb.nbins); + je_res = je_mallctl("arenas.nbins", &je_cb.nbins, &sz, NULL, 0); + assert(je_res == 0 && je_cb.nbins != 0); + + je_cb.bin_info = je_calloc(je_cb.nbins, sizeof(jeBinInfo)); + assert(je_cb.bin_info != NULL); + je_usage_info = je_calloc(je_cb.nbins, sizeof(jemallocBinUsageData)); + assert(je_usage_info != NULL); + + for (unsigned j = 0; j < je_cb.nbins; j++) { + bin_info = &je_cb.bin_info[j]; + /* The size of the current bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.size", j); + sz = sizeof(bin_info->reg_size); + je_res = je_mallctl(mallctl_name, &bin_info->reg_size, &sz, NULL, 0); + assert(je_res == 0); + /* Number of regions per slab */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.nregs", j); + sz = sizeof(bin_info->nregs); + je_res = je_mallctl(mallctl_name, &bin_info->nregs, &sz, NULL, 0); + assert(je_res == 0); + + /* init bin specific fast query keys */ + je_res = binQueryHelperInitialization(&bin_info->info_keys, j); + assert(je_res == 0); + + /* verify the reverse map of reg_size to bin index */ + assert(jeSize2BinIndexLgQ3(bin_info->reg_size) == j); + } + + /* defrag 
is supported mark it to enable defrag queries */ + defrag_supported = 1; + return 0; +} + +/* Total size of consumed meomry in unused regs in small bins (AKA external fragmentation). + * The function will refresh the epoch. + * + * return total fragmentation bytes + */ +unsigned long allocatorDefragGetFragSmallbins(void) { + assert(defrag_supported); + unsigned long frag = 0; + jeRefreshStats(&je_cb); + for (unsigned j = 0; j < je_cb.nbins; j++) { + jeBinInfo *bin_info = &je_cb.bin_info[j]; + jemallocBinUsageData *bin_usage = &je_usage_info[j]; + + /* Number of current slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_regs, &bin_usage->curr_regs); + /* Number of current slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_slabs, &bin_usage->curr_slabs); + /* Number of non full slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.nonfull_slabs, &bin_usage->curr_nonfull_slabs); + + /* Calculate the fragmentation bytes for the current bin and add it to the total. */ + frag += ((bin_info->nregs * bin_usage->curr_slabs) - bin_usage->curr_regs) * bin_info->reg_size; + } + return frag; +} + +/* Determines whether defragmentation should be performed on a pointer based on jemalloc information. + * + * bin_info Pointer to the bin information structure. + * bin_usage Pointer to the bin usage structure. + * nalloced Number of allocated regions in the bin. + * + * return 1 if defragmentation should be performed, 0 otherwise. + * + * This function checks the following conditions to determine if defragmentation should be performed: + * 1. If the number of allocated regions (nalloced) is equal to the total number of regions (bin_info->nregs), + * defragmentation is not necessary as moving regions is guaranteed not to change the fragmentation ratio. + * 2. If the number of non-full slabs (bin_usage->curr_nonfull_slabs) is less than 2, defragmentation is not performed + * because there is no other slab to move regions to. + * 3. 
If slab utilization < 'avg utilization'*1.125 [code 1.125 == (1000+UTILIZATION_THRESHOLD_FACTOR_MILI)/1000] + * than we should defrag. This is aligned with previous je_defrag_hint implementation. + */ +static inline int makeDefragDecision(jeBinInfo *bin_info, jemallocBinUsageData *bin_usage, unsigned long nalloced) { + unsigned long curr_full_slabs = bin_usage->curr_slabs - bin_usage->curr_nonfull_slabs; + size_t allocated_nonfull = bin_usage->curr_regs - curr_full_slabs * bin_info->nregs; + if (bin_info->nregs == nalloced || bin_usage->curr_nonfull_slabs < 2 || + 1000 * nalloced * bin_usage->curr_nonfull_slabs > (1000 + UTILIZATION_THRESHOLD_FACTOR_MILI) * allocated_nonfull) { + return 0; + } + return 1; +} + +/* + * Performs defragmentation analysis for a given ptr. + * + * ptr - ptr to memory region to be analyzed. + * + * return - the function returns 1 if defrag should be performed, 0 otherwise. + */ +int allocatorShouldDefrag(void *ptr) { + assert(defrag_supported); + size_t out[BATCH_QUERY_ARGS_OUT]; + size_t out_sz = sizeof(out); + size_t in_sz = sizeof(ptr); + for (unsigned j = 0; j < BATCH_QUERY_ARGS_OUT; j++) { + out[j] = -1; + } + je_mallctlbymib(je_cb.util_batch_query.key, + je_cb.util_batch_query.keylen, + out, &out_sz, + &ptr, in_sz); + /* handle results with appropriate quantum value */ + assert(SLAB_NUM_REGS(out, 0) > 0); + assert(SLAB_LEN(out, 0) > 0); + assert(SLAB_NFREE(out, 0) != (size_t)-1); + unsigned region_size = SLAB_LEN(out, 0) / SLAB_NUM_REGS(out, 0); + /* check that the allocation size is in range of small bins */ + if (region_size > je_cb.bin_info[je_cb.nbins - 1].reg_size) { + return 0; + } + /* get the index based on quantum used */ + unsigned binind = jeSize2BinIndexLgQ3(region_size); + /* make sure binind is in range and reverse map is correct */ + assert(binind < je_cb.nbins && region_size == je_cb.bin_info[binind].reg_size); + + return makeDefragDecision(&je_cb.bin_info[binind], + &je_usage_info[binind], + 
je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0)); +} + +#else + +int allocatorDefragInit(void) { + return -1; +} +void allocatorDefragFree(void *ptr, size_t size) { + UNUSED(ptr); + UNUSED(size); +} +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) { + UNUSED(size); + return NULL; +} +unsigned long allocatorDefragGetFragSmallbins(void) { + return 0; +} + +int allocatorShouldDefrag(void *ptr) { + UNUSED(ptr); + return 0; +} +#endif diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h new file mode 100644 index 0000000000..7fb56208b6 --- /dev/null +++ b/src/allocator_defrag.h @@ -0,0 +1,22 @@ +#ifndef __ALLOCATOR_DEFRAG_H +#define __ALLOCATOR_DEFRAG_H + +#if defined(USE_JEMALLOC) +#include +/* We can enable the server defrag capabilities only if we are using Jemalloc + * and the version that has the experimental.utilization namespace in mallctl . */ +#if defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1)) +#define HAVE_DEFRAG +#endif +#endif + +int allocatorDefragInit(void); +void allocatorDefragFree(void *ptr, size_t size); +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size); +unsigned long allocatorDefragGetFragSmallbins(void); +int allocatorShouldDefrag(void *ptr); + +#endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/defrag.c b/src/defrag.c index 4d34009f8b..b49a175f7c 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -49,10 +49,6 @@ typedef struct defragPubSubCtx { dict *(*clientPubSubChannels)(client *); } defragPubSubCtx; -/* this method was added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); - /* Defrag helper for generic allocations. * * returns NULL in case the allocation wasn't moved. 
@@ -61,7 +57,7 @@ int je_get_defrag_hint(void *ptr); void *activeDefragAlloc(void *ptr) { size_t size; void *newptr; - if (!je_get_defrag_hint(ptr)) { + if (!allocatorShouldDefrag(ptr)) { server.stat_active_defrag_misses++; return NULL; } @@ -69,9 +65,9 @@ void *activeDefragAlloc(void *ptr) { * make sure not to use the thread cache. so that we don't get back the same * pointers we try to free */ size = zmalloc_size(ptr); - newptr = zmalloc_no_tcache(size); + newptr = allocatorDefragAlloc(size); memcpy(newptr, ptr, size); - zfree_no_tcache(ptr); + allocatorDefragFree(ptr, size); server.stat_active_defrag_hits++; return newptr; } @@ -756,8 +752,8 @@ void defragScanCallback(void *privdata, const dictEntry *de) { * without the possibility of getting any results. */ float getAllocatorFragmentation(size_t *out_frag_bytes) { size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes); - + zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); + frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); /* Calculate the fragmentation ratio as the proportion of wasted memory in small * bins (which are defraggable) relative to the total allocated memory (including large bins). 
* This is because otherwise, if most of the memory usage is large bins, we may show high percentage, diff --git a/src/server.c b/src/server.c index aebbb57a93..51de89ee53 100644 --- a/src/server.c +++ b/src/server.c @@ -1297,8 +1297,8 @@ void cronUpdateMemoryStats(void) { * allocations, and allocator reserved pages that can be pursed (all not actual frag) */ zmalloc_get_allocator_info( &server.cron_malloc_stats.allocator_allocated, &server.cron_malloc_stats.allocator_active, - &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy, - &server.cron_malloc_stats.allocator_frag_smallbins_bytes); + &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy); + server.cron_malloc_stats.allocator_frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); /* in case the allocator isn't providing these stats, fake them so that * fragmentation info still shows some (inaccurate metrics) */ if (!server.cron_malloc_stats.allocator_resident) { @@ -6794,7 +6794,10 @@ __attribute__((weak)) int main(int argc, char **argv) { #endif tzset(); /* Populates 'timezone' global. */ zmalloc_set_oom_handler(serverOutOfMemoryHandler); - +#if defined(HAVE_DEFRAG) + int res = allocatorDefragInit(); + serverAssert(res == 0); +#endif /* To achieve entropy, in case of containers, their time() and getpid() can * be the same. 
But value of tv_usec is fast enough to make the difference */ gettimeofday(&tv, NULL); diff --git a/src/server.h b/src/server.h index 531ca8e7c8..8962b04086 100644 --- a/src/server.h +++ b/src/server.h @@ -35,6 +35,7 @@ #include "solarisfixes.h" #include "rio.h" #include "commands.h" +#include "allocator_defrag.h" #include #include diff --git a/src/zmalloc.c b/src/zmalloc.c index e18fa8bac2..a696111e47 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -84,8 +84,6 @@ void zlibc_free(void *ptr) { #define calloc(count, size) je_calloc(count, size) #define realloc(ptr, size) je_realloc(ptr, size) #define free(ptr) je_free(ptr) -#define mallocx(size, flags) je_mallocx(size, flags) -#define dallocx(ptr, flags) je_dallocx(ptr, flags) #endif #define thread_local _Thread_local @@ -207,25 +205,6 @@ void *zmalloc_usable(size_t size, size_t *usable) { return ptr; } -/* Allocation and free functions that bypass the thread cache - * and go straight to the allocator arena bins. - * Currently implemented only for jemalloc. Used for online defragmentation. */ -#ifdef HAVE_DEFRAG -void *zmalloc_no_tcache(size_t size) { - if (size >= SIZE_MAX / 2) zmalloc_oom_handler(size); - void *ptr = mallocx(size + PREFIX_SIZE, MALLOCX_TCACHE_NONE); - if (!ptr) zmalloc_oom_handler(size); - update_zmalloc_stat_alloc(zmalloc_size(ptr)); - return ptr; -} - -void zfree_no_tcache(void *ptr) { - if (ptr == NULL) return; - update_zmalloc_stat_free(zmalloc_size(ptr)); - dallocx(ptr, MALLOCX_TCACHE_NONE); -} -#endif - /* Try allocating memory and zero it, and return NULL if failed. * '*usable' is set to the usable size if non NULL. */ static inline void *ztrycalloc_usable_internal(size_t size, size_t *usable) { @@ -683,52 +662,7 @@ size_t zmalloc_get_rss(void) { #define STRINGIFY_(x) #x #define STRINGIFY(x) STRINGIFY_(x) -/* Compute the total memory wasted in fragmentation of inside small arena bins. - * Done by summing the memory in unused regs in all slabs of all small bins. 
*/ -size_t zmalloc_get_frag_smallbins(void) { - unsigned nbins; - size_t sz, frag = 0; - char buf[100]; - - sz = sizeof(unsigned); - assert(!je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0)); - for (unsigned j = 0; j < nbins; j++) { - size_t curregs, curslabs, reg_size; - uint32_t nregs; - - /* The size of the current bin */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.size", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, ®_size, &sz, NULL, 0)); - - /* Number of used regions in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curregs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curregs, &sz, NULL, 0)); - - /* Number of regions per slab */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.nregs", j); - sz = sizeof(uint32_t); - assert(!je_mallctl(buf, &nregs, &sz, NULL, 0)); - - /* Number of current slabs in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curslabs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curslabs, &sz, NULL, 0)); - - /* Calculate the fragmentation bytes for the current bin and add it to the total. */ - frag += ((nregs * curslabs) - curregs) * reg_size; - } - - return frag; -} - -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { uint64_t epoch = 1; size_t sz; *allocated = *resident = *active = 0; @@ -763,8 +697,6 @@ int zmalloc_get_allocator_info(size_t *allocated, *muzzy = pmuzzy * page; } - /* Total size of consumed meomry in unused regs in small bins (AKA external fragmentation). 
*/ - *frag_smallbins_bytes = zmalloc_get_frag_smallbins(); return 1; } @@ -789,13 +721,8 @@ int jemalloc_purge(void) { #else -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { - *allocated = *resident = *active = *frag_smallbins_bytes = 0; +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { + *allocated = *resident = *active = 0; if (retained) *retained = 0; if (muzzy) *muzzy = 0; return 1; diff --git a/src/zmalloc.h b/src/zmalloc.h index 9b51f4c866..38c2bae864 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -100,13 +100,6 @@ #include #endif -/* We can enable the server defrag capabilities only if we are using Jemalloc - * and the version used is our special version modified for the server having - * the ability to return per-allocation fragmentation hints. */ -#if defined(USE_JEMALLOC) && defined(JEMALLOC_FRAG_HINT) -#define HAVE_DEFRAG -#endif - /* The zcalloc symbol is a symbol name already used by zlib, which is defining * other names using the "z" prefix specific to zlib. 
In practice, linking * valkey with a static openssl, which itself might depend on a static libz @@ -138,12 +131,7 @@ __attribute__((malloc)) char *zstrdup(const char *s); size_t zmalloc_used_memory(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes); +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy); void set_jemalloc_bg_thread(int enable); int jemalloc_purge(void); size_t zmalloc_get_private_dirty(long pid); @@ -153,11 +141,6 @@ void zlibc_free(void *ptr); void zlibc_trim(void); void zmadvise_dontneed(void *ptr); -#ifdef HAVE_DEFRAG -void zfree_no_tcache(void *ptr); -__attribute__((malloc)) void *zmalloc_no_tcache(size_t size); -#endif - #ifndef HAVE_MALLOC_SIZE size_t zmalloc_size(void *ptr); size_t zmalloc_usable_size(void *ptr); From c4be326c3225ca4323ad7c21ccafee7197d0d539 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 22 Nov 2024 10:28:59 +0800 Subject: [PATCH 48/92] Make manual failover reset the on-going election to promote failover (#1274) If a manual failover got timed out, like the election don't get the enough votes, since we have a auth_timeout and a auth_retry_time, a new manual failover will not be able to proceed on the replica side. Like if we initiate a new manual failover after a election timed out, we will pause the primary, but on the replica side, due to retry_time, replica does not trigger the new election and the manual failover will eventually time out. In this case, if we initiate manual failover again and there is an ongoing election, we will reset it so that the replica can initiate a new election at the manual failover's request. 
Signed-off-by: Binbin --- src/cluster_legacy.c | 25 +++++++++++++-- tests/unit/cluster/manual-failover.tcl | 42 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 7b3384ee9f..c618feccae 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4848,6 +4848,27 @@ void clusterHandleReplicaMigration(int max_replicas) { * data loss due to the asynchronous primary-replica replication. * -------------------------------------------------------------------------- */ +void manualFailoverCanStart(void) { + serverAssert(server.cluster->mf_can_start == 0); + + if (server.cluster->failover_auth_time) { + /* There is another manual failover requested by the user. + * If we have an ongoing election, reset it because the user may initiate + * manual failover again when the previous manual failover timed out. + * Otherwise, if the previous election timed out (see auth_timeout) and + * before the next retry (see auth_retry_time), the new manual failover + * will pause the primary and replica can not do anything to advance the + * manual failover, and then the manual failover eventually times out. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a new manual failover. " + "Resetting the election.", + (unsigned long long)server.cluster->failover_auth_epoch); + } + + server.cluster->mf_can_start = 1; +} + /* Reset the manual failover state. This works for both primaries and replicas * as all the state about manual failover is cleared. * @@ -4888,7 +4909,7 @@ void clusterHandleManualFailover(void) { if (server.cluster->mf_primary_offset == replicationGetReplicaOffset()) { /* Our replication offset matches the primary replication offset * announced after clients were paused. We can start the failover. 
*/ - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); serverLog(LL_NOTICE, "All primary replication stream processed, " "manual failover can start."); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); @@ -6785,7 +6806,7 @@ int clusterCommandSpecial(client *c) { * primary to agree about the offset. We just failover taking over * it without coordination. */ serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); /* We can start a manual failover as soon as possible, setting a flag * here so that we don't need to waiting for the cron to kick in. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 78842068fa..bac2a7a4c7 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -271,3 +271,45 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval wait_for_cluster_propagation } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + test "Manual failover will reset the on-going election" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Let other primaries drop FAILOVER_AUTH_REQUEST so that the election won't + # get the enough votes and the election will time out. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + + # Replica doing the manual failover. + R 3 cluster failover + + # Waiting for primary and replica to confirm manual failover timeout. 
+ wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + set loglines1 [count_log_lines 0] + set loglines2 [count_log_lines -3] + + # Undo packet drop, so that replica can win the next election. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Replica doing the manual failover again. + R 3 cluster failover + + # Make sure the election is reset. + wait_for_log_messages -3 {"*Failover election in progress*Resetting the election*"} $loglines2 1000 50 + + # Wait for failover. + wait_for_condition 1000 50 { + [s -3 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure that the second manual failover does not time out. + verify_no_log_message 0 "*Manual failover timed out*" $loglines1 + verify_no_log_message -3 "*Manual failover timed out*" $loglines2 + } +} ;# start_cluster From 50aae13b0a7fffc6591ee2842d1ef4f2e59096dd Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 22 Nov 2024 10:29:24 +0800 Subject: [PATCH 49/92] Skip reclaim file page cache test in valgrind (#1327) The test is incompatible with valgrind. Added a new `--valgrind` argument to test suite, which will cause that test to be skipped. We skipped it in the past, see 5b61b0dc6d2579ee484fa6cf29bfac59513f84ab Signed-off-by: Binbin --- src/unit/README.md | 1 + src/unit/test_help.h | 4 +++- src/unit/test_main.c | 2 ++ src/unit/test_util.c | 4 +++- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/unit/README.md b/src/unit/README.md index 1ef439eaeb..93ac82f6dc 100644 --- a/src/unit/README.md +++ b/src/unit/README.md @@ -12,6 +12,7 @@ Tests flags: * UNIT_TEST_ACCURATE: Corresponds to the --accurate flag. This flag indicates the test should use extra computation to more accurately validate the tests. * UNIT_TEST_LARGE_MEMORY: Corresponds to the --large-memory flag. 
This flag indicates whether or not tests should use more than 100mb of memory. * UNIT_TEST_SINGLE: Corresponds to the --single flag. This flag indicates that a single test is being executed. +* UNIT_TEST_VALGRIND: Corresponds to the --valgrind flag. This flag is just a hint passed to the test to indicate that we are running it under valgrind. Tests are allowed to be passed in additional arbitrary argv/argc, which they can access from the argc and argv arguments of the test. diff --git a/src/unit/test_help.h b/src/unit/test_help.h index 804a7e3449..51e77d19d3 100644 --- a/src/unit/test_help.h +++ b/src/unit/test_help.h @@ -18,10 +18,12 @@ /* The flags are the following: * --accurate: Runs tests with more iterations. * --large-memory: Enables tests that consume more than 100mb. - * --single: A flag to indicate a specific test file was executed. */ + * --single: A flag to indicate a specific test file was executed. + * --valgrind: Runs tests with valgrind. */ #define UNIT_TEST_ACCURATE (1 << 0) #define UNIT_TEST_LARGE_MEMORY (1 << 1) #define UNIT_TEST_SINGLE (1 << 2) +#define UNIT_TEST_VALGRIND (1 << 3) #define KRED "\33[31m" #define KGRN "\33[32m" diff --git a/src/unit/test_main.c b/src/unit/test_main.c index 277d1b42c1..1b7cd8c96d 100644 --- a/src/unit/test_main.c +++ b/src/unit/test_main.c @@ -49,6 +49,8 @@ int main(int argc, char **argv) { else if (!strcasecmp(arg, "--single") && (j + 1 < argc)) { flags |= UNIT_TEST_SINGLE; file = argv[j + 1]; + } else if (!strcasecmp(arg, "--valgrind")) { + flags |= UNIT_TEST_VALGRIND; } } diff --git a/src/unit/test_util.c b/src/unit/test_util.c index 70be0255d8..4558c38c3b 100644 --- a/src/unit/test_util.c +++ b/src/unit/test_util.c @@ -286,7 +286,9 @@ static int cache_exist(int fd) { int test_reclaimFilePageCache(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); - UNUSED(flags); + + /* The test is incompatible with valgrind, skip it. 
*/ + if (flags & UNIT_TEST_VALGRIND) return 0; #if defined(__linux__) char *tmpfile = "/tmp/redis-reclaim-cache-test"; From 43b50261620fe813015a8f88717b46876c7e3f83 Mon Sep 17 00:00:00 2001 From: Sinkevich Artem Date: Fri, 22 Nov 2024 06:58:15 +0400 Subject: [PATCH 50/92] Fix argument types of formatting functions (#1253) `cluster_legacy.c`: `slot_info_pairs` has `uint16_t` values, but they were cast to `unsigned long` and `%i` was used. `valkey-cli.c`: `node->replicas_count` is `int`, not `unsigned long`. Signed-off-by: ArtSin --- src/cluster_legacy.c | 8 ++++---- src/valkey-cli.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index c618feccae..d01bfdbfe0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5609,12 +5609,12 @@ sds representClusterNodeFlags(sds ci, uint16_t flags) { * else each slot is added separately. */ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { for (int i = 0; i < slot_info_pairs_count; i += 2) { - unsigned long start = slot_info_pairs[i]; - unsigned long end = slot_info_pairs[i + 1]; + unsigned int start = slot_info_pairs[i]; + unsigned int end = slot_info_pairs[i + 1]; if (start == end) { - ci = sdscatfmt(ci, " %i", start); + ci = sdscatfmt(ci, " %u", start); } else { - ci = sdscatfmt(ci, " %i-%i", start, end); + ci = sdscatfmt(ci, " %u-%u", start, end); } } return ci; diff --git a/src/valkey-cli.c b/src/valkey-cli.c index 0ba03dc6ba..dc31981483 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -4391,7 +4391,7 @@ static sds clusterManagerNodeInfo(clusterManagerNode *node, int indent) { if (node->replicate != NULL) info = sdscatfmt(info, "\n%s replicates %S", spaces, node->replicate); else if (node->replicas_count) - info = sdscatfmt(info, "\n%s %U additional replica(s)", spaces, node->replicas_count); + info = sdscatfmt(info, "\n%s %i additional replica(s)", spaces, node->replicas_count); sdsfree(spaces); return info; 
} From 18d1eb5a8554474cfc34c89f859b664e65d9b48a Mon Sep 17 00:00:00 2001 From: Nadav Levanoni <38641521+nadav-levanoni@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:14:28 -0800 Subject: [PATCH 51/92] Remove redundant dict_index calculations (#1205) We need to start making use of the new `WithDictIndex` APIs which allow us to reuse the dict_index calculation (avoid over-calling `getKeySlot` for no good reason). In this PR I optimized `lookupKey` so it now calls `getKeySlot` to reuse the dict_index two additional times. It also optimizes the keys command to avoid unnecessary computation of the slot id. --------- Signed-off-by: Nadav Levanoni Co-authored-by: Nadav Levanoni --- src/db.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/db.c b/src/db.c index 10d4a04091..5a57863de8 100644 --- a/src/db.c +++ b/src/db.c @@ -59,6 +59,7 @@ int keyIsExpired(serverDb *db, robj *key); static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de); static int getKVStoreIndexForKey(sds key); dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index); +dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -97,7 +98,8 @@ void updateLFU(robj *val) { * expired on replicas even if the primary is lagging expiring our key via DELs * in the replication link. 
*/ robj *lookupKey(serverDb *db, robj *key, int flags) { - dictEntry *de = dbFind(db, key->ptr); + int dict_index = getKVStoreIndexForKey(key->ptr); + dictEntry *de = dbFindWithDictIndex(db, key->ptr, dict_index); robj *val = NULL; if (de) { val = dictGetVal(de); @@ -113,7 +115,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; - if (expireIfNeeded(db, key, expire_flags) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, key, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. */ val = NULL; } @@ -129,7 +131,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { if (!canUseSharedObject() && val->refcount == OBJ_SHARED_REFCOUNT) { val = dupStringObject(val); - kvstoreDictSetVal(db->keys, getKVStoreIndexForKey(key->ptr), de, val); + kvstoreDictSetVal(db->keys, dict_index, de, val); } if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { updateLFU(val); @@ -834,13 +836,23 @@ void keysCommand(client *c) { } else { kvs_it = kvstoreIteratorInit(c->db->keys); } - robj keyobj; - while ((de = kvs_di ? 
kvstoreDictIteratorNext(kvs_di) : kvstoreIteratorNext(kvs_it)) != NULL) { + while (1) { + robj keyobj; + int dict_index; + if (kvs_di) { + de = kvstoreDictIteratorNext(kvs_di); + dict_index = pslot; + } else { + de = kvstoreIteratorNext(kvs_it); + dict_index = kvstoreIteratorGetCurrentDictIndex(kvs_it); + } + if (de == NULL) break; + sds key = dictGetKey(de); if (allkeys || stringmatchlen(pattern, plen, key, sdslen(key), 0)) { initStaticStringObject(keyobj, key); - if (!keyIsExpired(c->db, &keyobj)) { + if (!keyIsExpiredWithDictIndex(c->db, &keyobj, dict_index)) { addReplyBulkCBuffer(c, key, sdslen(key)); numkeys++; } From 109d2dadc0a23326a71f58c8e312859689d6697c Mon Sep 17 00:00:00 2001 From: Yury-Fridlyand Date: Thu, 21 Nov 2024 19:19:10 -0800 Subject: [PATCH 52/92] Add slack link for users (#1273) Add slack link for users --------- Signed-off-by: Yury-Fridlyand Co-authored-by: Madelyn Olson --- .github/ISSUE_TEMPLATE/config.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9588d36020..8c4a0a8db5 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,6 +9,9 @@ contact_links: - name: Chat with us on Matrix? url: https://matrix.to/#/#valkey:matrix.org about: We are on Matrix too! + - name: Chat with us on Slack? + url: https://join.slack.com/t/valkey-oss-developer/shared_invite/zt-2nxs51chx-EB9hu9Qdch3GMfRcztTSkQ + about: We are on Slack too! - name: Documentation issue? url: https://github.com/valkey-io/valkey-doc/issues about: Report it on the valkey-doc repo. 
From 377ed22c971878b29b6d2c2c582198f2629f82ed Mon Sep 17 00:00:00 2001 From: Alan Scherger Date: Thu, 21 Nov 2024 21:26:30 -0600 Subject: [PATCH 53/92] [feat] add Ubuntu 24.04 Noble package support (#971) add Ubuntu 24.04 Noble package support Signed-off-by: Alan Scherger --- utils/releasetools/build-config.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/utils/releasetools/build-config.json b/utils/releasetools/build-config.json index 5ee9308b3b..5e39fae70f 100644 --- a/utils/releasetools/build-config.json +++ b/utils/releasetools/build-config.json @@ -12,6 +12,12 @@ "type": "deb", "platform": "focal" }, + { + "arch": "x86_64", + "target": "ubuntu24.04", + "type": "deb", + "platform": "noble" + }, { "arch": "arm64", "target": "ubuntu18.04", @@ -23,6 +29,12 @@ "target": "ubuntu20.04", "type": "deb", "platform": "focal" + }, + { + "arch": "arm64", + "target": "ubuntu24.04", + "type": "deb", + "platform": "noble" } ] } \ No newline at end of file From 979f4c1ceba9eecc0f984101775b101ab87b58fc Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 22 Nov 2024 16:49:16 +0800 Subject: [PATCH 54/92] Add cmake-build-debug and cmake-build-release to gitignore (#1340) Signed-off-by: Binbin --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b108b4bb92..d5cac316e6 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ tests/rdma/rdma-test tags build-debug/ build-release/ +cmake-build-debug/ +cmake-build-release/ From b9d224097a46dbe62ec0857cb91e7c67505a200e Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 23 Nov 2024 00:22:04 +0800 Subject: [PATCH 55/92] Brocast a PONG to all node in cluster when role changed (#1295) When a node role changes, we should brocast the change to notify other nodes. For example, one primary and one replica, after a failover, the replica became a new primary, the primary became a new replica. 
And then we trigger a second cluster failover for the new replica, the new replica will send a MFSTART to its primary, ie, the new primary. But the new primary may reject the MFSTART due to this logic: ``` } else if (type == CLUSTERMSG_TYPE_MFSTART) { if (!sender || sender->replicaof != myself) return 1; ``` In the new primary views, sender is still a primary, and sender->replicaof is NULL, so we will return. Then the manual failover timedout. Another possibility is that other primaries refuse to vote after receiving the FAILOVER_AUTH_REQUEST, since in their's views, sender is still a primary, so it refuse to vote, and then manual failover timedout. ``` void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { ... if (clusterNodeIsPrimary(node)) { serverLog(LL_WARNING, "Failover auth denied to... ``` The reason is that, currently, we only update the node->replicaof information when we receive a PING/PONG from the sender. For details, see clusterProcessPacket. Therefore, in some scenarios, such as clusters with many nodes and a large cluster-ping-interval (that is, cluster-node-timeout), the role change of the node will be very delayed. Added a DEBUG DISABLE-CLUSTER-RANDOM-PING command, send cluster ping to a random node every second (see clusterCron). Signed-off-by: Binbin --- src/cluster_legacy.c | 19 +++++--- src/cluster_legacy.h | 1 + src/debug.c | 5 +++ src/server.c | 1 + src/server.h | 2 + tests/unit/cluster/manual-failover.tcl | 61 ++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 6 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index d01bfdbfe0..97150b4d23 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2669,7 +2669,8 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * If the sender and myself are in the same shard, try psync. 
*/ clusterSetPrimary(sender, !are_in_same_shard, !are_in_same_shard); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG | + CLUSTER_TODO_BROADCAST_ALL); } else if (nodeIsPrimary(myself) && (sender_slots >= migrated_our_slots) && !are_in_same_shard) { /* When all our slots are lost to the sender and the sender belongs to * a different shard, this is likely due to a client triggered slot @@ -4538,7 +4539,7 @@ void clusterFailoverReplaceYourPrimary(void) { /* 4) Pong all the other nodes so that they can update the state * accordingly and detect that we switched to primary role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); /* 5) If there was a manual failover in progress, clear the state. */ resetManualFailover(); @@ -5029,7 +5030,7 @@ void clusterCron(void) { /* Ping some random node 1 time every 10 iterations, so that we usually ping * one random node every second. */ - if (!(iteration % 10)) { + if (!server.debug_cluster_disable_random_ping && !(iteration % 10)) { int j; /* Check a few random nodes and ping the one with the oldest @@ -5206,6 +5207,13 @@ void clusterBeforeSleep(void) { int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; clusterSaveConfigOrDie(fsync); } + + if (flags & CLUSTER_TODO_BROADCAST_ALL) { + /* Broadcast a pong to all known nodes. This is useful when something changes + * in the configuration and we want to make the cluster aware it before the + * regular ping. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } } void clusterDoBeforeSleep(int flags) { @@ -6556,7 +6564,7 @@ void clusterCommandSetSlot(client *c) { } /* After importing this slot, let the other nodes know as * soon as possible. 
*/ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); } } } @@ -6748,8 +6756,7 @@ int clusterCommandSpecial(client *c) { * If the instance is a replica, it had a totally different replication history. * In these both cases, myself as a replica has to do a full sync. */ clusterSetPrimary(n, 1, 1); - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_BROADCAST_ALL); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "count-failure-reports") && c->argc == 3) { /* CLUSTER COUNT-FAILURE-REPORTS */ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 2c3e1d83c8..39148c748d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -25,6 +25,7 @@ #define CLUSTER_TODO_SAVE_CONFIG (1 << 2) #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4) +#define CLUSTER_TODO_BROADCAST_ALL (1 << 5) /* clusterLink encapsulates everything needed to talk with a remote node. 
*/ typedef struct clusterLink { diff --git a/src/debug.c b/src/debug.c index 13da7bcc93..082e20a3b6 100644 --- a/src/debug.c +++ b/src/debug.c @@ -436,6 +436,8 @@ void debugCommand(client *c) { "CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", " This is valid only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", " When set to 1, the cluster link is closed after dropping a packet based on the filter.", + "DISABLE-CLUSTER-RANDOM-PING <0|1>", + " Disable sending cluster ping to a random node every second.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -607,6 +609,9 @@ void debugCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "close-cluster-link-on-packet-drop") && c->argc == 3) { server.debug_cluster_close_link_on_packet_drop = atoi(c->argv[2]->ptr); addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "disable-cluster-random-ping") && c->argc == 3) { + server.debug_cluster_disable_random_ping = atoi(c->argv[2]->ptr); + addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "object") && (c->argc == 3 || c->argc == 4)) { dictEntry *de; robj *val; diff --git a/src/server.c b/src/server.c index 51de89ee53..87ce9b15a6 100644 --- a/src/server.c +++ b/src/server.c @@ -2693,6 +2693,7 @@ void initServer(void) { server.blocking_op_nesting = 0; server.thp_enabled = 0; server.cluster_drop_packet_filter = -1; + server.debug_cluster_disable_random_ping = 0; server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; diff --git a/src/server.h b/src/server.h index 8962b04086..51ec92451d 100644 --- a/src/server.h +++ b/src/server.h @@ -2194,6 +2194,8 @@ struct valkeyServer { int cluster_slot_stats_enabled; /* Cluster slot usage statistics tracking enabled. */ /* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. 
*/ uint32_t debug_cluster_close_link_on_packet_drop : 1; + /* Debug config to control the random ping. When set, we will disable the random ping in clusterCron. */ + uint32_t debug_cluster_disable_random_ping : 1; sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */ /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index bac2a7a4c7..220ffc3eaf 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -313,3 +313,64 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval verify_no_log_message -3 "*Manual failover timed out*" $loglines2 } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 1000}} { + test "Broadcast PONG to the cluster when the node role changes" { + # R0 is a primary and R3 is a replica, we will do multiple cluster failover + # and then check their role and flags. + set R0_nodeid [R 0 cluster myid] + set R3_nodeid [R 3 cluster myid] + + # Make sure we don't send PINGs for a short period of time. + for {set j 0} {$j < [llength $::servers]} {incr j} { + R $j debug disable-cluster-random-ping 0 + R $j config set cluster-ping-interval 300000 + } + + R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "Failover does not happened" + } + + # Get the node information of R0 and R3 in my view from CLUSTER NODES + # R0 should be a replica and R3 should be a primary in all views. 
+ for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 1000 50 { + [check_cluster_node_mark slave $j $R0_nodeid] && + [check_cluster_node_mark master $j $R3_nodeid] + } else { + puts "R0_nodeid: $R0_nodeid" + puts "R3_nodeid: $R3_nodeid" + puts "R $j cluster nodes:" + puts [R $j cluster nodes] + fail "Node role does not changed in the first failover" + } + } + + R 0 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {master} && + [s -3 role] eq {slave} + } else { + fail "The second failover does not happened" + } + + # Get the node information of R0 and R3 in my view from CLUSTER NODES + # R0 should be a primary and R3 should be a replica in all views. + for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 1000 50 { + [check_cluster_node_mark master $j $R0_nodeid] && + [check_cluster_node_mark slave $j $R3_nodeid] + } else { + puts "R0_nodeid: $R0_nodeid" + puts "R3_nodeid: $R3_nodeid" + puts "R $j cluster nodes:" + puts [R $j cluster nodes] + fail "Node role does not changed in the second failover" + } + } + } +} ;# start_cluster From 9851006d6d7af570d7f38025f4b1de68f12c7731 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 23 Nov 2024 00:23:38 +0800 Subject: [PATCH 56/92] Add short client info log to CLUSTER MEET / FORGET / RESET commands (#1249) These commands are all administrator commands. If they are operated incorrectly, serious consequences may occur. Print the full client info by using catClientInfoString, the info is useful when we want to identify the source of request. Since the origin client info is very large and might complicate the output, we added a catClientInfoShortString function, it will only print some basic fields, we want these fields that are useful to identify the client. These fields are: - id - addr - laddr - connection info - name - user - lib-name - lib-ver And also used it to replace the origin client info where it has the same purpose. 
Some logging is changed from full client info to short client info: - CLUSTER FAILOVER - FAILOVER / PSYNC - REPLICAOF NO ONE - SHUTDOWN Signed-off-by: Binbin --- src/cluster_legacy.c | 12 +++++++++++- src/networking.c | 23 +++++++++++++++++++++++ src/replication.c | 6 +++--- src/server.c | 2 +- src/server.h | 1 + 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 97150b4d23..e4b25e265d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6596,6 +6596,10 @@ int clusterCommandSpecial(client *c) { addReplyErrorFormat(c, "Invalid node address specified: %s:%s", (char *)c->argv[2]->ptr, (char *)c->argv[3]->ptr); } else { + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster meet %s:%lld (user request from '%s').", (char *)c->argv[2]->ptr, port, + client); + sdsfree(client); addReply(c, shared.ok); } } else if (!strcasecmp(c->argv[1]->ptr, "flushslots") && c->argc == 2) { @@ -6710,6 +6714,9 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "Can't forget my master!"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster forget %s (user request from '%s').", (char *)c->argv[2]->ptr, client); + sdsfree(client); clusterBlacklistAddNode(n); clusterDelNode(n); clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); @@ -6798,7 +6805,7 @@ int clusterCommandSpecial(client *c) { } resetManualFailover(); server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); if (takeover) { /* A takeover does not perform any initial check. 
It just @@ -6877,6 +6884,9 @@ int clusterCommandSpecial(client *c) { "master nodes containing keys"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster reset (user request from '%s').", client); + sdsfree(client); clusterReset(hard); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { diff --git a/src/networking.c b/src/networking.c index 9558780f39..93aa9d00ae 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3385,6 +3385,29 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { return ret; } +/* Concatenate a string representing the state of a client in a human + * readable format, into the sds string 's'. + * + * This is a simplified and shortened version of catClientInfoString, + * it only added some basic fields for tracking clients. */ +sds catClientInfoShortString(sds s, client *client, int hide_user_data) { + if (!server.crashed) waitForClientIO(client); + char conninfo[CONN_INFO_LEN]; + + sds ret = sdscatfmt( + s, + FMTARGS( + "id=%U", (unsigned long long)client->id, + " addr=%s", getClientPeerId(client), + " laddr=%s", getClientSockname(client), + " %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)), + " name=%s", hide_user_data ? "*redacted*" : (client->name ? (char *)client->name->ptr : ""), + " user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"), + " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", + " lib-ver=%s", client->lib_ver ? 
(char *)client->lib_ver->ptr : "")); + return ret; +} + sds getAllClientsInfoString(int type, int hide_user_data) { listNode *ln; listIter li; diff --git a/src/replication.c b/src/replication.c index 437ae278ec..1654847bd6 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1051,7 +1051,7 @@ void syncCommand(client *c) { } else { replicationUnsetPrimary(); } - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (failover request from '%s')", client); sdsfree(client); } else { @@ -3971,7 +3971,7 @@ void replicaofCommand(client *c) { if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { if (server.primary_host) { replicationUnsetPrimary(); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); sdsfree(client); } @@ -4000,7 +4000,7 @@ void replicaofCommand(client *c) { /* There was no previous primary or the user specified a different one, * we can continue. 
*/ replicationSetPrimary(c->argv[1]->ptr, port, 0); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, server.primary_port, client); sdsfree(client); diff --git a/src/server.c b/src/server.c index 87ce9b15a6..6d346ac74c 100644 --- a/src/server.c +++ b/src/server.c @@ -4325,7 +4325,7 @@ int prepareForShutdown(client *c, int flags) { server.shutdown_flags = flags; if (c != NULL) { - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "User requested shutdown... (user request from '%s')", client); sdsfree(client); } else { diff --git a/src/server.h b/src/server.h index 51ec92451d..f4c7306009 100644 --- a/src/server.h +++ b/src/server.h @@ -2853,6 +2853,7 @@ char *getClientPeerId(client *client); char *getClientSockName(client *client); int isClientConnIpV6(client *c); sds catClientInfoString(sds s, client *client, int hide_user_data); +sds catClientInfoShortString(sds s, client *client, int hide_user_data); sds getAllClientsInfoString(int type, int hide_user_data); int clientSetName(client *c, robj *name, const char **err); void rewriteClientCommandVector(client *c, int argc, ...); From 33f42d7fb597ce28040f184ee57ed86d6f6ffbd8 Mon Sep 17 00:00:00 2001 From: eifrah-aws Date: Fri, 22 Nov 2024 22:17:53 +0200 Subject: [PATCH 57/92] CMake fixes + README update (#1276) --- CMakeLists.txt | 2 +- README.md | 14 +++++------ cmake/Modules/Utils.cmake | 13 ++++++++++ cmake/Modules/ValkeySetup.cmake | 43 ++++++++++++++++++--------------- deps/jemalloc/CMakeLists.txt | 13 ++++++++-- deps/lua/CMakeLists.txt | 9 +++++++ src/CMakeLists.txt | 10 ++++++++ 7 files changed, 75 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index ad0bab8896..77d0c4e7d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) # Must be done first if (APPLE) diff --git a/README.md b/README.md index 94f38bccf7..a32ac255df 100644 --- a/README.md +++ b/README.md @@ -297,19 +297,19 @@ Other options supported by Valkey's `CMake` build system: ## Special build flags -- `-DBUILD_TLS=` enable TLS build for Valkey -- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported) +- `-DBUILD_TLS=` enable TLS build for Valkey. Default: `no` +- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported). Default: `no` - `-DBUILD_MALLOC=` choose the allocator to use. Default on Linux: `jemalloc`, for other OS: `libc` -- `-DBUILD_SANITIZER=` build with address sanitizer enabled -- `-DBUILD_UNIT_TESTS=[1|0]` when set, the build will produce the executable `valkey-unit-tests` -- `-DBUILD_TEST_MODULES=[1|0]` when set, the build will include the modules located under the `tests/modules` folder -- `-DBUILD_EXAMPLE_MODULES=[1|0]` when set, the build will include the example modules located under the `src/modules` folder +- `-DBUILD_SANITIZER=` build with address sanitizer enabled. Default: disabled (no sanitizer) +- `-DBUILD_UNIT_TESTS=[yes|no]` when set, the build will produce the executable `valkey-unit-tests`. Default: `no` +- `-DBUILD_TEST_MODULES=[yes|no]` when set, the build will include the modules located under the `tests/modules` folder. Default: `no` +- `-DBUILD_EXAMPLE_MODULES=[yes|no]` when set, the build will include the example modules located under the `src/modules` folder. Default: `no` ## Common flags - `-DCMAKE_BUILD_TYPE=` define the build type, see CMake manual for more details - `-DCMAKE_INSTALL_PREFIX=/installation/path` override this value to define a custom install prefix. Default: `/usr/local` -- `-G` generate build files for "Generator Name". 
By default, CMake will generate `Makefile`s. +- `-G""` generate build files for "Generator Name". By default, CMake will generate `Makefile`s. ## Verbose build diff --git a/cmake/Modules/Utils.cmake b/cmake/Modules/Utils.cmake index 304f39fb2c..59076397de 100644 --- a/cmake/Modules/Utils.cmake +++ b/cmake/Modules/Utils.cmake @@ -100,3 +100,16 @@ function (valkey_parse_build_option OPTION_VALUE OUT_ARG_ENUM) PARENT_SCOPE) endif () endfunction () + +function (valkey_pkg_config PKGNAME OUT_VARIABLE) + if (NOT FOUND_PKGCONFIG) + # Locate pkg-config once + find_package(PkgConfig REQUIRED) + set(FOUND_PKGCONFIG 1) + endif () + pkg_check_modules(__PREFIX REQUIRED ${PKGNAME}) + message(STATUS "Found library for '${PKGNAME}': ${__PREFIX_LIBRARIES}") + set(${OUT_VARIABLE} + "${__PREFIX_LIBRARIES}" + PARENT_SCOPE) +endfunction () diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index e935c3b308..4fafd07910 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -74,9 +74,11 @@ endmacro () macro (valkey_build_and_install_bin target sources ld_flags libs link_name) add_executable(${target} ${sources}) - if (USE_JEMALLOC) - # Using jemalloc - target_link_libraries(${target} jemalloc) + if (USE_JEMALLOC + OR USE_TCMALLOC + OR USE_TCMALLOC_MINIMAL) + # Using custom allocator + target_link_libraries(${target} ${ALLOCATOR_LIB}) endif () # Place this line last to ensure that ${ld_flags} is placed last on the linker line @@ -151,16 +153,23 @@ endif () if (BUILD_MALLOC) if ("${BUILD_MALLOC}" STREQUAL "jemalloc") set(MALLOC_LIB "jemalloc") + set(ALLOCATOR_LIB "jemalloc") add_valkey_server_compiler_options("-DUSE_JEMALLOC") set(USE_JEMALLOC 1) elseif ("${BUILD_MALLOC}" STREQUAL "libc") set(MALLOC_LIB "libc") elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc") set(MALLOC_LIB "tcmalloc") + valkey_pkg_config(libtcmalloc ALLOCATOR_LIB) + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC 1) elseif ("${BUILD_MALLOC}" 
STREQUAL "tcmalloc_minimal") set(MALLOC_LIB "tcmalloc_minimal") + valkey_pkg_config(libtcmalloc_minimal ALLOCATOR_LIB) + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC_MINIMAL 1) else () message(FATAL_ERROR "BUILD_MALLOC can be one of: jemalloc, libc, tcmalloc or tcmalloc_minimal") endif () @@ -202,16 +211,12 @@ if (BUILD_RDMA) if (USE_RDMA EQUAL 2) # Module message(STATUS "Building RDMA as module") add_valkey_server_compiler_options("-DUSE_RDMA=2") - find_package(PkgConfig REQUIRED) # Locate librdmacm & libibverbs, fail if we can't find them - pkg_check_modules(RDMACM REQUIRED librdmacm) - pkg_check_modules(IBVERBS REQUIRED libibverbs) + valkey_pkg_config(librdmacm RDMACM_LIBS) + valkey_pkg_config(libibverbs IBVERBS_LIBS) - message(STATUS "${RDMACM_LINK_LIBRARIES};${IBVERBS_LINK_LIBRARIES}") - list(APPEND RDMA_LIBS "${RDMACM_LIBRARIES};${IBVERBS_LIBRARIES}") - unset(RDMACM_LINK_LIBRARIES CACHE) - unset(IBVERBS_LINK_LIBRARIES CACHE) + list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") set(BUILD_RDMA_MODULE 1) elseif (USE_RDMA EQUAL 1) # RDMA can only be built as a module. 
So disable it @@ -266,17 +271,18 @@ endif () # Sanitizer if (BUILD_SANITIZER) - # For best results, force libc - set(MALLOC_LIB, "libc") + # Common CFLAGS + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-sanitize-recover=all") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-omit-frame-pointer") if ("${BUILD_SANITIZER}" STREQUAL "address") - add_valkey_server_compiler_options("-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=address") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=address") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=address") elseif ("${BUILD_SANITIZER}" STREQUAL "thread") - add_valkey_server_compiler_options("-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=thread") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=thread") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=thread") elseif ("${BUILD_SANITIZER}" STREQUAL "undefined") - add_valkey_server_compiler_options("-fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=undefined") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=undefined") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=undefined") else () message(FATAL_ERROR "Unknown sanitizer: ${BUILD_SANITIZER}") endif () @@ -366,7 +372,6 @@ include(SourceFiles) # Clear the below variables from the cache unset(CMAKE_C_FLAGS CACHE) -unset(BUILD_SANITIZER CACHE) unset(VALKEY_SERVER_LDFLAGS CACHE) unset(VALKEY_SERVER_CFLAGS CACHE) unset(PYTHON_EXE CACHE) diff --git a/deps/jemalloc/CMakeLists.txt b/deps/jemalloc/CMakeLists.txt index e79e960ec2..0fa99df55e 100644 --- a/deps/jemalloc/CMakeLists.txt +++ b/deps/jemalloc/CMakeLists.txt @@ -12,9 +12,18 @@ if (NOT EXISTS ${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a) COMMAND sh -c "${JEMALLOC_SRC_DIR}/configure --disable-cxx \ --with-version=5.3.0-0-g0 --with-lg-quantum=3 
--disable-cache-oblivious --with-jemalloc-prefix=je_ \ --enable-static --disable-shared --prefix=${JEMALLOC_INSTALL_DIR}" - WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} COMMAND_ERROR_IS_FATAL ANY) + WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} RESULTS_VARIABLE CONFIGURE_RESULT) + + if (NOT ${CONFIGURE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc configure failed") + endif () + execute_process(COMMAND make -j${VALKEY_PROCESSOR_COUNT} lib/libjemalloc.a install - WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}") + WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}" RESULTS_VARIABLE MAKE_RESULT) + + if (NOT ${MAKE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc build failed") + endif () endif () # Import the compiled library as a CMake target diff --git a/deps/lua/CMakeLists.txt b/deps/lua/CMakeLists.txt index e911de9232..0629d7f978 100644 --- a/deps/lua/CMakeLists.txt +++ b/deps/lua/CMakeLists.txt @@ -1,5 +1,7 @@ project(lualib) +include(CheckFunctionExists) + set(LUA_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/src") set(LUA_SRCS ${LUA_SRC_DIR}/fpconv.c @@ -42,3 +44,10 @@ set(LUA_SRCS add_library(lualib STATIC "${LUA_SRCS}") target_include_directories(lualib PUBLIC "${LUA_SRC_DIR}") target_compile_definitions(lualib PRIVATE ENABLE_CJSON_GLOBAL) + +# Use mkstemp if available +check_function_exists(mkstemp HAVE_MKSTEMP) +if (HAVE_MKSTEMP) + target_compile_definitions(lualib PRIVATE LUA_USE_MKSTEMP) +endif () +unset(HAVE_MKSTEMP CACHE) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b7e328163b..51e1b5a2e6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,16 @@ if (VALKEY_RELEASE_BUILD) set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) endif () +if (BUILD_SANITIZER) + # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input) + # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS' + # are set with the link & compile flags required + message(STATUS "Adding sanitizer flags for target valkey-server") + 
target_compile_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_CFLAGS}) + target_link_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_LDFLAGS}) +endif () +unset(BUILD_SANITIZER CACHE) + # Target: valkey-cli list(APPEND CLI_LIBS "linenoise") valkey_build_and_install_bin(valkey-cli "${VALKEY_CLI_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${CLI_LIBS}" "redis-cli") From 653d5f7fe3d44adfb2a2e10c9110a3efacd3f0da Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 25 Nov 2024 09:59:37 +0800 Subject: [PATCH 58/92] Support empty callback on function and free temp function in async way (#1334) We have a replicationEmptyDbCallback, it is a callback used by emptyData while flushing away old data. Previously, we did not add this callback logic for function, in case of abuse, there may be a lot of functions, and also to make the code consistent, we add the same callback logic for function. Changes around this commit: 1. Extend emptyData / functionsLibCtxClear to support passing callback when flushing functions. 2. Added disklessLoad function create and discard helper function, just like disklessLoadInitTempDb and disklessLoadDiscardTempDb), we wll always flush the temp function in a async way to avoid any block. 3. Cleanup around discardTempDb, remove the callback pointer since in async way we don't need the callback. 4. Remove functionsLibCtxClear call in readSyncBulkPayload, because we called emptyData in the previous lines, which also empty functions. We are doing this callback in replication is because during the flush, replica may block a while if the flush is doing in the sync way, to avoid the primary to detect the replica is timing out, replica will use this callback to notify the primary (we also do this callback when loading a RDB). And in the async way, we empty the data in the bio and there is no slw operation, so it will ignores the callback. 
Signed-off-by: Binbin --- src/db.c | 10 ++++------ src/functions.c | 16 ++++++++-------- src/functions.h | 4 ++-- src/replication.c | 22 +++++++++++++++++----- src/server.h | 2 +- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/db.c b/src/db.c index 5a57863de8..d3ef19027d 100644 --- a/src/db.c +++ b/src/db.c @@ -574,7 +574,7 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { if (with_functions) { serverAssert(dbnum == -1); - functionsLibCtxClearCurrent(async); + functionsLibCtxClearCurrent(async, callback); } /* Also fire the end event. Note that this event will fire almost @@ -602,12 +602,10 @@ serverDb *initTempDb(void) { return tempDb; } -/* Discard tempDb, this can be slow (similar to FLUSHALL), but it's always async. */ -void discardTempDb(serverDb *tempDb, void(callback)(dict *)) { - int async = 1; - +/* Discard tempDb, it's always async. */ +void discardTempDb(serverDb *tempDb) { /* Release temp DBs. */ - emptyDbStructure(tempDb, -1, async, callback); + emptyDbStructure(tempDb, -1, 1, NULL); for (int i = 0; i < server.dbnum; i++) { kvstoreRelease(tempDb[i].keys); kvstoreRelease(tempDb[i].expires); diff --git a/src/functions.c b/src/functions.c index 916d8fd622..b694e35252 100644 --- a/src/functions.c +++ b/src/functions.c @@ -161,9 +161,9 @@ static void engineLibraryDispose(void *obj) { } /* Clear all the functions from the given library ctx */ -void functionsLibCtxClear(functionsLibCtx *lib_ctx) { - dictEmpty(lib_ctx->functions, NULL); - dictEmpty(lib_ctx->libraries, NULL); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)) { + dictEmpty(lib_ctx->functions, callback); + dictEmpty(lib_ctx->libraries, callback); dictIterator *iter = dictGetIterator(lib_ctx->engines_stats); dictEntry *entry = NULL; while ((entry = dictNext(iter))) { @@ -175,13 +175,13 @@ void functionsLibCtxClear(functionsLibCtx *lib_ctx) { lib_ctx->cache_memory = 0; } -void functionsLibCtxClearCurrent(int async) { +void 
functionsLibCtxClearCurrent(int async, void(callback)(dict *)) { if (async) { functionsLibCtx *old_l_ctx = curr_functions_lib_ctx; curr_functions_lib_ctx = functionsLibCtxCreate(); freeFunctionsAsync(old_l_ctx); } else { - functionsLibCtxClear(curr_functions_lib_ctx); + functionsLibCtxClear(curr_functions_lib_ctx, callback); } } @@ -196,7 +196,7 @@ static void functionsLibCtxFreeGeneric(functionsLibCtx *functions_lib_ctx, int a /* Free the given functions ctx */ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { - functionsLibCtxClear(functions_lib_ctx); + functionsLibCtxClear(functions_lib_ctx, NULL); dictRelease(functions_lib_ctx->functions); dictRelease(functions_lib_ctx->libraries); dictRelease(functions_lib_ctx->engines_stats); @@ -380,7 +380,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l dictReleaseIterator(iter); iter = NULL; - functionsLibCtxClear(functions_lib_ctx_src); + functionsLibCtxClear(functions_lib_ctx_src, NULL); if (old_libraries_list) { listRelease(old_libraries_list); old_libraries_list = NULL; @@ -820,7 +820,7 @@ void functionFlushCommand(client *c) { return; } - functionsLibCtxClearCurrent(async); + functionsLibCtxClearCurrent(async, NULL); /* Indicate that the command changed the data so it will be replicated and * counted as a data change (for persistence configuration) */ diff --git a/src/functions.h b/src/functions.h index 429405bb2d..b199fbd06e 100644 --- a/src/functions.h +++ b/src/functions.h @@ -133,9 +133,9 @@ dict *functionsLibGet(void); size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx); functionsLibCtx *functionsLibCtxGetCurrent(void); functionsLibCtx *functionsLibCtxCreate(void); -void functionsLibCtxClearCurrent(int async); +void functionsLibCtxClearCurrent(int async, void(callback)(dict *)); void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); -void functionsLibCtxClear(functionsLibCtx *lib_ctx); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, 
void(callback)(dict *)); void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); diff --git a/src/replication.c b/src/replication.c index 1654847bd6..dcf7ee3f8c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1981,7 +1981,20 @@ serverDb *disklessLoadInitTempDb(void) { /* Helper function for readSyncBulkPayload() to discard our tempDb * when the loading succeeded or failed. */ void disklessLoadDiscardTempDb(serverDb *tempDb) { - discardTempDb(tempDb, replicationEmptyDbCallback); + discardTempDb(tempDb); +} + +/* Helper function for to initialize temp function lib context. + * The temp ctx may be populated by functionsLibCtxSwapWithCurrent or + * freed by disklessLoadDiscardFunctionsLibCtx later. */ +functionsLibCtx *disklessLoadFunctionsLibCtxCreate(void) { + return functionsLibCtxCreate(); +} + +/* Helper function to discard our temp function lib context + * when the loading succeeded or failed. */ +void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) { + freeFunctionsAsync(temp_functions_lib_ctx); } /* If we know we got an entirely different data set from our primary @@ -2186,7 +2199,7 @@ void readSyncBulkPayload(connection *conn) { if (use_diskless_load && server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { /* Initialize empty tempDb dictionaries. 
*/ diskless_load_tempDb = disklessLoadInitTempDb(); - temp_functions_lib_ctx = functionsLibCtxCreate(); + temp_functions_lib_ctx = disklessLoadFunctionsLibCtxCreate(); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_STARTED, NULL); } @@ -2226,7 +2239,6 @@ void readSyncBulkPayload(connection *conn) { dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); - functionsLibCtxClear(functions_lib_ctx); } rioInitWithConn(&rdb, conn, server.repl_transfer_size); @@ -2264,7 +2276,7 @@ void readSyncBulkPayload(connection *conn) { NULL); disklessLoadDiscardTempDb(diskless_load_tempDb); - functionsLibCtxFree(temp_functions_lib_ctx); + disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); } else { /* Remove the half-loaded data in case we started with an empty replica. */ @@ -2289,7 +2301,7 @@ void readSyncBulkPayload(connection *conn) { swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ - functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx, 0); + functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx, 1); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_COMPLETED, NULL); diff --git a/src/server.h b/src/server.h index f4c7306009..09b67b2670 100644 --- a/src/server.h +++ b/src/server.h @@ -3572,7 +3572,7 @@ long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callbac void flushAllDataAndResetRDB(int flags); long long dbTotalServerKeyCount(void); serverDb *initTempDb(void); -void discardTempDb(serverDb *tempDb, void(callback)(dict *)); +void discardTempDb(serverDb *tempDb); int selectDb(client *c, int id); From c4920bca4a6681b2ba652e4dc52b72fe47db516a Mon Sep 17 00:00:00 2001 From: Parth <661497+parthpatel@users.noreply.github.com> Date: Mon, 25 Nov 2024 01:01:43 -0800 Subject: [PATCH 59/92] Integrating 
fast_float to optionally replace strtod (#1260) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fast_float is a C++ header-only library to parse doubles using SIMD instructions. The purpose is to speed up sorted sets and other commands that use doubles. A single-file copy of fast_float is included in this repo. This introduces an optional dependency on a C++ compiler. The use of fast_float is enabled at compile time using the make variable `USE_FAST_FLOAT=yes`. It is disabled by default. Fixes #1069. --------- Signed-off-by: Parth Patel <661497+parthpatel@users.noreply.github.com> Signed-off-by: Parth <661497+parthpatel@users.noreply.github.com> Signed-off-by: Madelyn Olson Signed-off-by: Viktor Söderqvist Co-authored-by: Roshan Swain Co-authored-by: Madelyn Olson Co-authored-by: Viktor Söderqvist --- .github/workflows/ci.yml | 21 +- .github/workflows/daily.yml | 2 +- deps/Makefile | 7 + deps/README.md | 15 + deps/fast_float/fast_float.h | 3912 +++++++++++++++++ deps/fast_float_c_interface/Makefile | 37 + .../fast_float_strtod.cpp | 24 + src/Makefile | 13 +- src/debug.c | 4 +- src/resp_parser.c | 8 +- src/sort.c | 8 +- src/t_zset.c | 12 +- src/unit/test_files.h | 3 + src/unit/test_valkey_strtod.c | 36 + src/util.c | 8 +- src/valkey-cli.c | 7 +- src/valkey_strtod.h | 42 + tests/test_helper.tcl | 6 +- 18 files changed, 4136 insertions(+), 29 deletions(-) create mode 100644 deps/fast_float/fast_float.h create mode 100644 deps/fast_float_c_interface/Makefile create mode 100644 deps/fast_float_c_interface/fast_float_strtod.cpp create mode 100644 src/unit/test_valkey_strtod.c create mode 100644 src/valkey_strtod.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bc946b7193..3fec424cee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make -j4 all-with-unit-tests 
SERVER_CFLAGS='-Werror' BUILD_TLS=yes + run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes USE_FAST_FLOAT=yes - name: test run: | sudo apt-get install tcl8.6 tclx @@ -108,23 +108,30 @@ jobs: steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' + # Build with additional upcoming features + run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes build-32bit: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make + # Fast float requires C++ 32-bit libraries to compile on 64-bit ubuntu + # machine i.e. "-cross" suffixed version. Cross-compiling c++ to 32-bit + # also requires multilib support for g++ compiler i.e. "-multilib" + # suffixed version of g++. g++-multilib generally includes libstdc++. + # *cross version as well, but it is also added explicitly just in case. run: | - sudo apt-get update && sudo apt-get install libc6-dev-i386 - make -j4 SERVER_CFLAGS='-Werror' 32bit + sudo apt-get update + sudo apt-get install libc6-dev-i386 libstdc++-11-dev-i386-cross gcc-multilib g++-multilib + make -j4 SERVER_CFLAGS='-Werror' 32bit USE_FAST_FLOAT=yes build-libc-malloc: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc + run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc USE_FAST_FLOAT=yes build-almalinux8-jemalloc: runs-on: ubuntu-latest @@ -134,8 +141,8 @@ jobs: - name: make run: | - dnf -y install epel-release gcc make procps-ng which - make -j4 SERVER_CFLAGS='-Werror' + dnf -y install epel-release gcc gcc-c++ make procps-ng which + make -j4 SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes format-yaml: runs-on: ubuntu-latest diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 8bdbc8d4c2..e39e672689 100644 --- 
a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -319,7 +319,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes - name: testprep run: | sudo apt-get install tcl8.6 tclx tcl-tls diff --git a/deps/Makefile b/deps/Makefile index f1e4bd6ce2..72389def95 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -42,6 +42,7 @@ distclean: -(cd jemalloc && [ -f Makefile ] && $(MAKE) distclean) > /dev/null || true -(cd hdr_histogram && $(MAKE) clean) > /dev/null || true -(cd fpconv && $(MAKE) clean) > /dev/null || true + -(cd fast_float_c_interface && $(MAKE) clean) > /dev/null || true -(rm -f .make-*) .PHONY: distclean @@ -116,3 +117,9 @@ jemalloc: .make-prerequisites cd jemalloc && $(MAKE) lib/libjemalloc.a .PHONY: jemalloc + +fast_float_c_interface: .make-prerequisites + @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) + cd fast_float_c_interface && $(MAKE) + +.PHONY: fast_float_c_interface diff --git a/deps/README.md b/deps/README.md index b918b47456..97a7baf64b 100644 --- a/deps/README.md +++ b/deps/README.md @@ -6,6 +6,7 @@ should be provided by the operating system. * **linenoise** is a readline replacement. It is developed by the same authors of Valkey but is managed as a separated project and updated as needed. * **lua** is Lua 5.1 with minor changes for security and additional libraries. * **hdr_histogram** Used for per-command latency tracking histograms. +* **fast_float** is a replacement for strtod to convert strings to floats efficiently. How to upgrade the above dependencies === @@ -105,3 +106,17 @@ We use a customized version based on master branch commit e4448cf6d1cd08fff51981 2. Copy updated files from newer version onto files in /hdr_histogram. 3. Apply the changes from 1 above to the updated files. 
+fast_float +--- +The fast_float library provides fast header-only implementations for the C++ from_chars functions for `float` and `double` types as well as integer types. These functions convert ASCII strings representing decimal values (e.g., `1.3e10`) into binary types. The functions are much faster than comparable number-parsing functions from existing C++ standard libraries. + +Specifically, `fast_float` provides the following function to parse floating-point numbers with a C++17-like syntax (the library itself only requires C++11): + + template ())> + from_chars_result_t from_chars(UC const *first, UC const *last, T &value, chars_format fmt = chars_format::general); + +To upgrade the library, +1. Check out https://github.com/fastfloat/fast_float/tree/main +2. cd fast_float +3. Invoke "python3 ./script/amalgamate.py --output fast_float.h" +4. Copy fast_float.h file to "deps/fast_float/". diff --git a/deps/fast_float/fast_float.h b/deps/fast_float/fast_float.h new file mode 100644 index 0000000000..9ba3bc2e97 --- /dev/null +++ b/deps/fast_float/fast_float.h @@ -0,0 +1,3912 @@ +// fast_float by Daniel Lemire +// fast_float by João Paulo Magalhaes +// +// +// with contributions from Eugene Golushkov +// with contributions from Maksim Kita +// with contributions from Marcin Wojdyr +// with contributions from Neal Richardson +// with contributions from Tim Paine +// with contributions from Fabio Pellacini +// with contributions from Lénárd Szolnoki +// with contributions from Jan Pharago +// with contributions from Maya Warrier +// with contributions from Taha Khokhar +// +// +// Licensed under the Apache License, Version 2.0, or the +// MIT License or the Boost License. This file may not be copied, +// modified, or distributed except according to those terms. 
+// +// MIT License Notice +// +// MIT License +// +// Copyright (c) 2021 The fast_float authors +// +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// Apache License (Version 2.0) Notice +// +// Copyright 2021 The fast_float authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// +// BOOST License Notice +// +// Boost Software License - Version 1.0 - August 17th, 2003 +// +// Permission is hereby granted, free of charge, to any person or organization +// obtaining a copy of the software and accompanying documentation covered by +// this license (the "Software") to use, reproduce, display, distribute, +// execute, and transmit the Software, and to prepare derivative works of the +// Software, and to permit third-parties to whom the Software is furnished to +// do so, all subject to the following: +// +// The copyright notices in the Software and this entire statement, including +// the above license grant, this restriction and the following disclaimer, +// must be included in all copies of the Software, in whole or in part, and +// all derivative works of the Software, unless such copies or derivative +// works are solely in the form of machine-executable object code generated by +// a source language processor. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+// + +#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H +#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if defined(__cpp_lib_is_constant_evaluated) && \ + __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \ + __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#define FASTFLOAT_IS_CONSTEXPR 1 +#else +#define FASTFLOAT_CONSTEXPR20 +#define FASTFLOAT_IS_CONSTEXPR 0 +#endif + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0 +#else +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1 +#endif + +#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifndef FASTFLOAT_FLOAT_COMMON_H +#define FASTFLOAT_FLOAT_COMMON_H + +#include +#include +#include +#include +#include +#include +#ifdef __has_include +#if __has_include() && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include +#endif +#endif + +namespace fast_float { + +#define FASTFLOAT_JSONFMT (1 << 5) +#define FASTFLOAT_FORTRANFMT (1 << 6) + +enum chars_format { + scientific = 1 << 0, + fixed = 1 << 2, + hex = 1 << 3, + no_infnan = 1 << 4, + // RFC 8259: https://datatracker.ietf.org/doc/html/rfc8259#section-6 + json = FASTFLOAT_JSONFMT | fixed | scientific | no_infnan, + // Extension of RFC 8259 where, e.g., "inf" and "nan" are allowed. 
+ json_or_infnan = FASTFLOAT_JSONFMT | fixed | scientific, + fortran = FASTFLOAT_FORTRANFMT | fixed | scientific, + general = fixed | scientific +}; + +template struct from_chars_result_t { + UC const *ptr; + std::errc ec; +}; +using from_chars_result = from_chars_result_t; + +template struct parse_options_t { + constexpr explicit parse_options_t(chars_format fmt = chars_format::general, + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} + + /** Which number formats are accepted */ + chars_format format; + /** The character used as decimal point */ + UC decimal_point; +}; +using parse_options = parse_options_t; + +} // namespace fast_float + +#if FASTFLOAT_HAS_BIT_CAST +#include +#endif + +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) +#define FASTFLOAT_64BIT 1 +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#define FASTFLOAT_32BIT 1 +#else + // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. +#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) 
+#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) +#include +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +#define FASTFLOAT_VISUAL_STUDIO 1 +#endif + +#if defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined _WIN32 +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) +#include +#elif defined(sun) || defined(__sun) +#include +#elif defined(__MVS__) +#include +#else +#ifdef __has_include +#if __has_include() +#include +#endif //__has_include() +#endif //__has_include +#endif +# +#ifndef __BYTE_ORDER__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#define FASTFLOAT_IS_BIG_ENDIAN 1 +#endif +#endif + +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#define FASTFLOAT_SSE2 1 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#define FASTFLOAT_NEON 1 +#endif + +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) +#define FASTFLOAT_HAS_SIMD 1 +#endif + +#if defined(__GNUC__) +// disable -Wcast-align=strict (GCC only) +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#else +#define FASTFLOAT_SIMD_DISABLE_WARNINGS +#endif + +#if defined(__GNUC__) +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") +#else +#define FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif + +#ifdef FASTFLOAT_VISUAL_STUDIO +#define fastfloat_really_inline __forceinline +#else +#define fastfloat_really_inline inline __attribute__((always_inline)) +#endif + +#ifndef FASTFLOAT_ASSERT 
+#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } +#endif + +#ifndef FASTFLOAT_DEBUG_ASSERT +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } +#endif + +// rust style `try!()` macro, or `?` operator +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } + +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type + +namespace fast_float { + +fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED + return std::is_constant_evaluated(); +#else + return false; +#endif +} + +template +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same::value || std::is_same::value +#if __STDCPP_FLOAT32_T__ + || std::is_same::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same::value +#endif + ; +} + +template +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value; +} + +// Compares two ASCII strings in a case insensitive manner. +template +inline FASTFLOAT_CONSTEXPR14 bool +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { + char running_diff{0}; + for (size_t i = 0; i < length; ++i) { + running_diff |= (char(input1[i]) ^ char(input2[i])); + } + return (running_diff == 0) || (running_diff == 32); +} + +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." 
+#endif + +// a pointer and a length to a contiguous block of memory +template struct span { + const T *ptr; + size_t length; + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span() : ptr(nullptr), length(0) {} + + constexpr size_t len() const noexcept { return length; } + + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return ptr[index]; + } +}; + +struct value128 { + uint64_t low; + uint64_t high; + constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + constexpr value128() : low(0), high(0) {} +}; + +/* Helper C++14 constexpr generic implementation of leading_zeroes */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; +} + +/* result might be undefined when input_num is zero */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { + assert(input_num > 0); + if (cpp20_and_in_constexpr()) { + return leading_zeroes_generic(input_num); + } +#ifdef FASTFLOAT_VISUAL_STUDIO +#if defined(_M_X64) || defined(_M_ARM64) + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
+ _BitScanReverse64(&leading_zero, input_num); + return (int)(63 - leading_zero); +#else + return leading_zeroes_generic(input_num); +#endif +#else + return __builtin_clzll(input_num); +#endif +} + +// slow emulation routine for 32-bit +fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = (uint64_t)(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + (uint64_t)(lo < bd); + return lo; +} + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { + return umul128_generic(ab, cd, hi); +} +#endif // !__MINGW64__ + +#endif // FASTFLOAT_32BIT + +// compute 64-bit a*b +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { + if (cpp20_and_in_constexpr()) { + value128 answer; + answer.low = umul128_generic(a, b, &answer.high); + return answer; + } + value128 answer; +#if defined(_M_ARM64) && !defined(__MINGW32__) + // ARM64 has native support for 64-bit multiplications, no need to emulate + // But MinGW on ARM64 doesn't have native support for 64-bit multiplications + answer.high = __umulh(a, b); + answer.low = a * b; +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) + answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) + __uint128_t r = ((__uint128_t)a) * b; + answer.low = uint64_t(r); + 
answer.high = uint64_t(r >> 64); +#else + answer.low = umul128_generic(a, b, &answer.high); +#endif + return answer; +} + +struct adjusted_mantissa { + uint64_t mantissa{0}; + int32_t power2{0}; // a negative value indicates an invalid result + adjusted_mantissa() = default; + constexpr bool operator==(const adjusted_mantissa &o) const { + return mantissa == o.mantissa && power2 == o.power2; + } + constexpr bool operator!=(const adjusted_mantissa &o) const { + return mantissa != o.mantissa || power2 != o.power2; + } +}; + +// Bias so we can get the real exponent with an invalid adjusted_mantissa. +constexpr static int32_t invalid_am_bias = -0x8000; + +// used for binary_format_lookup_tables::max_mantissa +constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; + +template struct binary_format_lookup_tables; + +template struct binary_format : binary_format_lookup_tables { + using equiv_uint = + typename std::conditional::type; + + static inline constexpr int mantissa_explicit_bits(); + static inline constexpr int minimum_exponent(); + static inline constexpr int infinite_power(); + static inline constexpr int sign_index(); + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int max_exponent_fast_path(); + static inline constexpr int max_exponent_round_to_even(); + static inline constexpr int min_exponent_round_to_even(); + static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int largest_power_of_ten(); + static inline constexpr int smallest_power_of_ten(); + static inline constexpr T exact_power_of_ten(int64_t power); + static inline constexpr size_t max_digits(); + static inline constexpr equiv_uint exponent_mask(); + static inline constexpr equiv_uint mantissa_mask(); + static inline constexpr equiv_uint hidden_bit_mask(); +}; + +template struct 
binary_format_lookup_tables { + static constexpr double powers_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + + // Largest integer value v so that (5**index * v) <= 1<<53. + // 0x20000000000000 == 1 << 53 + static constexpr uint64_t max_mantissa[] = { + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr double 
binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template struct binary_format_lookup_tables { + static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + + // Largest integer value v so that (5**index * v) <= 1<<24. + // 0x1000000 == 1<<24 + static constexpr uint64_t max_mantissa[] = { + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr float binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -22; +#endif +} + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -10; +#endif +} + +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 52; +} +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 10; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return -4; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return 
-17; +} + +template <> inline constexpr int binary_format::minimum_exponent() { + return -1023; +} +template <> inline constexpr int binary_format::minimum_exponent() { + return -127; +} + +template <> inline constexpr int binary_format::infinite_power() { + return 0x7FF; +} +template <> inline constexpr int binary_format::infinite_power() { + return 0xFF; +} + +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} + +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 22; +} +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 10; +} + +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 22 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 10 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} + +template <> +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)powers_of_ten[0], powers_of_ten[power]; +} +template <> +inline constexpr float binary_format::exact_power_of_ten(int64_t power) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)powers_of_ten[0], powers_of_ten[power]; +} + +template <> inline 
constexpr int binary_format::largest_power_of_ten() { + return 308; +} +template <> inline constexpr int binary_format::largest_power_of_ten() { + return 38; +} + +template <> +inline constexpr int binary_format::smallest_power_of_ten() { + return -342; +} +template <> inline constexpr int binary_format::smallest_power_of_ten() { + return -64; +} + +template <> inline constexpr size_t binary_format::max_digits() { + return 769; +} +template <> inline constexpr size_t binary_format::max_digits() { + return 114; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { + return 0x7F800000; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { + return 0x7FF0000000000000; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { + return 0x007FFFFF; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { + return 0x000FFFFFFFFFFFFF; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { + return 0x00800000; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { + return 0x0010000000000000; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +to_float(bool negative, adjusted_mantissa am, T &value) { + using fastfloat_uint = typename binary_format::equiv_uint; + fastfloat_uint word = (fastfloat_uint)am.mantissa; + word |= fastfloat_uint(am.power2) + << binary_format::mantissa_explicit_bits(); + word |= fastfloat_uint(negative) << binary_format::sign_index(); +#if FASTFLOAT_HAS_BIT_CAST + value = std::bit_cast(word); +#else + ::memcpy(&value, &word, sizeof(T)); +#endif +} + +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default +template struct space_lut { + static constexpr bool value[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr bool space_lut::value[]; + +#endif + +inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } +#endif + +template static constexpr uint64_t int_cmp_zeros() { + static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), + "Unsupported character size"); + return (sizeof(UC) == 1) ? 0x3030303030303030 + : (sizeof(UC) == 2) + ? 
(uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | + uint64_t(UC('0')) << 16 | UC('0')) + : (uint64_t(UC('0')) << 32 | UC('0')); +} +template static constexpr int int_cmp_len() { + return sizeof(uint64_t) / sizeof(UC); +} +template static constexpr UC const *str_const_nan() { + return nullptr; +} +template <> constexpr char const *str_const_nan() { return "nan"; } +template <> constexpr wchar_t const *str_const_nan() { return L"nan"; } +template <> constexpr char16_t const *str_const_nan() { + return u"nan"; +} +template <> constexpr char32_t const *str_const_nan() { + return U"nan"; +} +template static constexpr UC const *str_const_inf() { + return nullptr; +} +template <> constexpr char const *str_const_inf() { return "infinity"; } +template <> constexpr wchar_t const *str_const_inf() { + return L"infinity"; +} +template <> constexpr char16_t const *str_const_inf() { + return u"infinity"; +} +template <> constexpr char32_t const *str_const_inf() { + return U"infinity"; +} + +template struct int_luts { + static constexpr uint8_t chdigit[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + + static constexpr size_t maxdigits_u64[] = { + 64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13}; + + static constexpr uint64_t min_safe_u64[] = { + 9223372036854775808ull, 12157665459056928801ull, 4611686018427387904, + 7450580596923828125, 4738381338321616896, 3909821048582988049, + 9223372036854775808ull, 12157665459056928801ull, 10000000000000000000ull, + 5559917313492231481, 2218611106740436992, 8650415919381337933, + 2177953337809371136, 6568408355712890625, 1152921504606846976, + 2862423051509815793, 6746640616477458432, 15181127029874798299ull, + 1638400000000000000, 3243919932521508681, 6221821273427820544, + 11592836324538749809ull, 876488338465357824, 1490116119384765625, + 2481152873203736576, 4052555153018976267, 6502111422497947648, + 10260628712958602189ull, 15943230000000000000ull, 787662783788549761, + 1152921504606846976, 1667889514952984961, 2386420683693101056, + 3379220508056640625, 4738381338321616896}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint8_t int_luts::chdigit[]; + +template constexpr size_t int_luts::maxdigits_u64[]; + +template constexpr uint64_t int_luts::min_safe_u64[]; + +#endif + +template +fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) { + return int_luts<>::chdigit[static_cast(c)]; +} + +fastfloat_really_inline constexpr size_t max_digits_u64(int base) { + return int_luts<>::maxdigits_u64[base - 2]; +} + +// If a u64 is exactly max_digits_u64() in length, this is +// the value below which it has definitely overflowed. 
+fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) { + return int_luts<>::min_safe_u64[base - 2]; +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_FAST_FLOAT_H +#define FASTFLOAT_FAST_FLOAT_H + + +namespace fast_float { +/** + * This function parses the character sequence [first,last) for a number. It + * parses floating-point numbers expecting a locale-indepent format equivalent + * to what is used by std::strtod in the default ("C") locale. The resulting + * floating-point value is the closest floating-point values (using either float + * or double), using the "round to even" convention for values that would + * otherwise fall right in-between two values. That is, we provide exact parsing + * according to the IEEE standard. + * + * Given a successful parse, the pointer (`ptr`) in the returned value is set to + * point right after the parsed number, and the `value` referenced is set to the + * parsed value. In case of error, the returned `ec` contains a representative + * error, otherwise the default (`std::errc()`) value is stored. + * + * The implementation does not throw and does not allocate memory (e.g., with + * `new` or `malloc`). + * + * Like the C++17 standard, the `fast_float::from_chars` functions take an + * optional last argument of the type `fast_float::chars_format`. It is a bitset + * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt & + * fast_float::chars_format::scientific` are set to determine whether we allow + * the fixed point and scientific notation respectively. The default is + * `fast_float::chars_format::general` which allows both `fixed` and + * `scientific`. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt = chars_format::general) noexcept; + +/** + * Like from_chars, but accepts an `options` argument to govern number parsing. 
+ */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept; +/** + * from_chars for integer types. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept; + +} // namespace fast_float +#endif // FASTFLOAT_FAST_FLOAT_H + +#ifndef FASTFLOAT_ASCII_NUMBER_H +#define FASTFLOAT_ASCII_NUMBER_H + +#include +#include +#include +#include +#include +#include + + +#ifdef FASTFLOAT_SSE2 +#include +#endif + +#ifdef FASTFLOAT_NEON +#include +#endif + +namespace fast_float { + +template fastfloat_really_inline constexpr bool has_simd_opt() { +#ifdef FASTFLOAT_HAS_SIMD + return std::is_same::value; +#else + return false; +#endif +} + +// Next function can be micro-optimized, but compilers are entirely +// able to optimize it well. +template +fastfloat_really_inline constexpr bool is_integer(UC c) noexcept { + return !(c > UC('9') || c < UC('0')); +} + +fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { + return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 | + (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 | + (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 | + (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56; +} + +// Read 8 UC into a u64. Truncates UC if not char. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +read8_to_u64(const UC *chars) { + if (cpp20_and_in_constexpr() || !std::is_same::value) { + uint64_t val = 0; + for (int i = 0; i < 8; ++i) { + val |= uint64_t(uint8_t(*chars)) << (i * 8); + ++chars; + } + return val; + } + uint64_t val; + ::memcpy(&val, chars, sizeof(uint64_t)); +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. 
+ val = byteswap(val); +#endif + return val; +} + +#ifdef FASTFLOAT_SSE2 + +fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i packed = _mm_packus_epi16(data, data); +#ifdef FASTFLOAT_64BIT + return uint64_t(_mm_cvtsi128_si64(packed)); +#else + uint64_t value; + // Visual Studio + older versions of GCC don't support _mm_storeu_si64 + _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed); + return value; +#endif + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + _mm_loadu_si128(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +#elif defined(FASTFLOAT_NEON) + +fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + uint8x8_t utf8_packed = vmovn_u16(data); + return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + vld1q_u16(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +#endif // FASTFLOAT_SSE2 + +// MSVC SFINAE is broken pre-VS2017 +#if defined(_MSC_VER) && _MSC_VER <= 1900 +template +#else +template ()) = 0> +#endif +// dummy for compile +uint64_t simd_read8_to_u64(UC const *) { + return 0; +} + +// credit @aqrit +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t +parse_eight_digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return uint32_t(val); +} + +// Call 
this if chars are definitely 8 digits. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t +parse_eight_digits_unrolled(UC const *chars) noexcept { + if (cpp20_and_in_constexpr() || !has_simd_opt()) { + return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay + } + return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); +} + +// credit @aqrit +fastfloat_really_inline constexpr bool +is_made_of_eight_digits_fast(uint64_t val) noexcept { + return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & + 0x8080808080808080)); +} + +#ifdef FASTFLOAT_HAS_SIMD + +// Call this if chars might not be 8 digits. +// Using this style (instead of is_made_of_eight_digits_fast() then +// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +simd_parse_if_eight_digits_unrolled(const char16_t *chars, + uint64_t &i) noexcept { + if (cpp20_and_in_constexpr()) { + return false; + } +#ifdef FASTFLOAT_SSE2 + FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i data = + _mm_loadu_si128(reinterpret_cast(chars)); + + // (x - '0') <= 9 + // http://0x80.pl/articles/simd-parsing-int-sequences.html + const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720)); + const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759)); + + if (_mm_movemask_epi8(t1) == 0) { + i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); + return true; + } else + return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS +#elif defined(FASTFLOAT_NEON) + FASTFLOAT_SIMD_DISABLE_WARNINGS + const uint16x8_t data = vld1q_u16(reinterpret_cast(chars)); + + // (x - '0') <= 9 + // http://0x80.pl/articles/simd-parsing-int-sequences.html + const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0')); + const uint16x8_t mask = vcltq_u16(t0, vmovq_n_u16('9' - '0' + 1)); + + if (vminvq_u16(mask) == 0xFFFF) { + i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); + return true; + } else + 
return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS +#else + (void)chars; + (void)i; + return false; +#endif // FASTFLOAT_SSE2 +} + +#endif // FASTFLOAT_HAS_SIMD + +// MSVC SFINAE is broken pre-VS2017 +#if defined(_MSC_VER) && _MSC_VER <= 1900 +template +#else +template ()) = 0> +#endif +// dummy for compile +bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) { + return 0; +} + +template ::value) = 0> +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) { + if (!has_simd_opt()) { + return; + } + while ((std::distance(p, pend) >= 8) && + simd_parse_if_eight_digits_unrolled( + p, i)) { // in rare cases, this will overflow, but that's ok + p += 8; + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const char *&p, const char *const pend, + uint64_t &i) { + // optimizes better than parse_if_eight_digits_unrolled() for UC = char. + while ((std::distance(p, pend) >= 8) && + is_made_of_eight_digits_fast(read8_to_u64(p))) { + i = i * 100000000 + + parse_eight_digits_unrolled(read8_to_u64( + p)); // in rare cases, this will overflow, but that's ok + p += 8; + } +} + +enum class parse_error { + no_error, + // [JSON-only] The minus sign must be followed by an integer. + missing_integer_after_sign, + // A sign must be followed by an integer or dot. + missing_integer_or_dot_after_sign, + // [JSON-only] The integer part must not have leading zeros. + leading_zeros_in_integer_part, + // [JSON-only] The integer part must have at least one digit. + no_digits_in_integer_part, + // [JSON-only] If there is a decimal point, there must be digits in the + // fractional part. + no_digits_in_fractional_part, + // The mantissa must have at least one digit. + no_digits_in_mantissa, + // Scientific notation requires an exponential part. 
+ missing_exponential_part, +}; + +template struct parsed_number_string_t { + int64_t exponent{0}; + uint64_t mantissa{0}; + UC const *lastmatch{nullptr}; + bool negative{false}; + bool valid{false}; + bool too_many_digits{false}; + // contains the range of the significant digits + span integer{}; // non-nullable + span fraction{}; // nullable + parse_error error{parse_error::no_error}; +}; + +using byte_span = span; +using parsed_number_string = parsed_number_string_t; + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +report_parse_error(UC const *p, parse_error error) { + parsed_number_string_t answer; + answer.valid = false; + answer.lastmatch = p; + answer.error = error; + return answer; +} + +// Assuming that you use no more than 19 digits, this will +// parse an ASCII string. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +parse_number_string(UC const *p, UC const *pend, + parse_options_t options) noexcept { + chars_format const fmt = options.format; + UC const decimal_point = options.decimal_point; + + parsed_number_string_t answer; + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == UC('-')); +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == UC('-')) || (!(fmt & FASTFLOAT_JSONFMT) && *p == UC('+'))) { +#else + if (*p == UC('-')) { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here +#endif + ++p; + if (p == pend) { + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); + } + if (fmt & FASTFLOAT_JSONFMT) { + if (!is_integer(*p)) { // a sign must be followed by an integer + return report_parse_error(p, + parse_error::missing_integer_after_sign); + } + } else { + if (!is_integer(*p) && + (*p != + decimal_point)) { // a sign must be followed by an integer or the dot + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); + } + } + } + UC const *const start_digits = p; + + uint64_t i 
= 0; // an unsigned int avoids signed overflows (which are bad) + + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - + UC('0')); // might overflow, we will handle the overflow later + ++p; + } + UC const *const end_of_integer_part = p; + int64_t digit_count = int64_t(end_of_integer_part - start_digits); + answer.integer = span(start_digits, size_t(digit_count)); + if (fmt & FASTFLOAT_JSONFMT) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0) { + return report_parse_error(p, parse_error::no_digits_in_integer_part); + } + if ((start_digits[0] == UC('0') && digit_count > 1)) { + return report_parse_error(start_digits, + parse_error::leading_zeros_in_integer_part); + } + } + + int64_t exponent = 0; + const bool has_decimal_point = (p != pend) && (*p == decimal_point); + if (has_decimal_point) { + ++p; + UC const *before = p; + // can occur at most twice without overflowing, but let it occur more, since + // for integers with many digits, digit parsing is the primary bottleneck. + loop_parse_if_eight_digits(p, pend, i); + + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - UC('0')); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = before - p; + answer.fraction = span(before, size_t(p - before)); + digit_count -= exponent; + } + if (fmt & FASTFLOAT_JSONFMT) { + // at least 1 digit in fractional part + if (has_decimal_point && exponent == 0) { + return report_parse_error(p, + parse_error::no_digits_in_fractional_part); + } + } else if (digit_count == + 0) { // we must have encountered at least one integer! 
+ return report_parse_error(p, parse_error::no_digits_in_mantissa); + } + int64_t exp_number = 0; // explicit exponential part + if (((fmt & chars_format::scientific) && (p != pend) && + ((UC('e') == *p) || (UC('E') == *p))) || + ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) && + ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || + (UC('D') == *p)))) { + UC const *location_of_e = p; + if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || + (UC('D') == *p)) { + ++p; + } + bool neg_exp = false; + if ((p != pend) && (UC('-') == *p)) { + neg_exp = true; + ++p; + } else if ((p != pend) && + (UC('+') == + *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) + ++p; + } + if ((p == pend) || !is_integer(*p)) { + if (!(fmt & chars_format::fixed)) { + // The exponential part is invalid for scientific notation, so it must + // be a trailing token for fixed notation. However, fixed notation is + // disabled, so report a scientific notation error. + return report_parse_error(p, parse_error::missing_exponential_part); + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } else { + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - UC('0')); + if (exp_number < 0x10000000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + } else { + // If it scientific and not fixed, we have to bail out. + if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { + return report_parse_error(p, parse_error::missing_exponential_part); + } + } + answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) { // this is uncommon + // It is possible that the integer had an overflow. 
+ // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + UC const *start = start_digits; + while ((start != pend) && (*start == UC('0') || *start == decimal_point)) { + if (*start == UC('0')) { + digit_count--; + } + start++; + } + + if (digit_count > 19) { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. + // We don't need to check if is_integer, since we use the + // pre-tokenized spans from above. + i = 0; + p = answer.integer.ptr; + UC const *int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + if (i >= minimal_nineteen_digit_integer) { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. 
+ p = answer.fraction.ptr; + UC const *frac_end = p + answer.fraction.len(); + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t +parse_int_string(UC const *p, UC const *pend, T &value, int base) { + from_chars_result_t answer; + + UC const *const first = p; + + bool negative = (*p == UC('-')); + if (!std::is_signed::value && negative) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == UC('-')) || (*p == UC('+'))) { +#else + if (*p == UC('-')) { +#endif + ++p; + } + + UC const *const start_num = p; + + while (p != pend && *p == UC('0')) { + ++p; + } + + const bool has_leading_zeros = p > start_num; + + UC const *const start_digits = p; + + uint64_t i = 0; + if (base == 10) { + loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible + } + while (p != pend) { + uint8_t digit = ch_to_digit(*p); + if (digit >= base) { + break; + } + i = uint64_t(base) * i + digit; // might overflow, check this later + p++; + } + + size_t digit_count = size_t(p - start_digits); + + if (digit_count == 0) { + if (has_leading_zeros) { + value = 0; + answer.ec = std::errc(); + answer.ptr = p; + } else { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + } + return answer; + } + + answer.ptr = p; + + // check u64 overflow + size_t max_digits = max_digits_u64(base); + if (digit_count > max_digits) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + // this check can be eliminated for all other types, but they will all require + // a max_digits(base) equivalent + if (digit_count == max_digits && i < 
min_safe_u64(base)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + + // check other types overflow + if (!std::is_same::value) { + if (i > uint64_t(std::numeric_limits::max()) + uint64_t(negative)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + } + + if (negative) { +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + // this weird workaround is required because: + // - converting unsigned to signed when its value is greater than signed max + // is UB pre-C++23. + // - reinterpret_casting (~i + 1) would work, but it is not constexpr + // this is always optimized into a neg instruction (note: T is an integer + // type) + value = T(-std::numeric_limits::max() - + T(i - uint64_t(std::numeric_limits::max()))); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#endif + } else { + value = T(i); + } + + answer.ec = std::errc(); + return answer; +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_FAST_TABLE_H +#define FASTFLOAT_FAST_TABLE_H + +#include + +namespace fast_float { + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + +/** + * The smallest non-zero float (binary64) is 2^-1074. + * We take as input numbers of the form w x 10^q where w < 2^64. + * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. + * However, we have that + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074. + * Thus it is possible for a number of the form w * 10^-342 where + * w is a 64-bit value to be a non-zero floating-point number. 
+ ********* + * Any number of form w * 10^309 where w>= 1 is going to be + * infinite in binary64 so we never need to worry about powers + * of 5 greater than 308. + */ +template struct powers_template { + + constexpr static int smallest_power_of_five = + binary_format::smallest_power_of_ten(); + constexpr static int largest_power_of_five = + binary_format::largest_power_of_ten(); + constexpr static int number_of_entries = + 2 * (largest_power_of_five - smallest_power_of_five + 1); + // Powers of five from 5^-342 all the way to 5^308 rounded toward one. + constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 
0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 
0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 
0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 
0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 
0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 
0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 
0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 
0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 
0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 
0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 
0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 
0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 0x8533285c936b35de, 0xd53a88958f87275f, + 0xa67ff273b8460356, 0x8a892abaf368f137, + 0xd01fef10a657842c, 0x2d2b7569b0432d85, + 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, + 0xcb3f2f7642717713, 0x241c70a936219a73, + 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, + 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, + 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 
0xbc633b39673c8cec, + 0x976e41088617ca01, 0xd5be0503e085d813, + 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, + 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, + 0xb8da1662e7b00a17, 0x3d6a751f3b936243, + 0xe7109bfba19c0c9d, 0xcc512670a783ad4, + 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, + 0xe1a63853bbd26451, 0x5e7873f8a0396973, + 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, + 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, + 0xac2820d9623bf429, 0x546345fa9fbdcd44, + 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, + 0xa81f301449ee8c70, 0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c, 0x73832eec6fff3111, + 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, + 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, + 0xfa856334878fc150, 0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, + 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, + 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, + 0xbaa718e68396cffd, 0xd30560258f54e6ba, + 0xe950df20247c83fd, 0x47c6b82ef32a2069, + 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5, 0x58180fddd97723a6, + 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, + }; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr uint64_t + powers_template::power_of_five_128[number_of_entries]; + +#endif + +using powers = powers_template<>; + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_DECIMAL_TO_BINARY_H +#define FASTFLOAT_DECIMAL_TO_BINARY_H + +#include +#include +#include +#include +#include +#include + 
+namespace fast_float { + +// This will compute or rather approximate w * 5**q and return a pair of 64-bit +// words approximating the result, with the "high" part corresponding to the +// most significant bits and the low part corresponding to the least significant +// bits. +// +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +compute_product_approximation(int64_t q, uint64_t w) { + const int index = 2 * int(q - powers::smallest_power_of_five); + // For small values of q, e.g., q in [0,27], the answer is always exact + // because The line value128 firstproduct = full_multiplication(w, + // power_of_five_128[index]); gives the exact answer. + value128 firstproduct = + full_multiplication(w, powers::power_of_five_128[index]); + static_assert((bit_precision >= 0) && (bit_precision <= 64), + " precision should be in (0,64]"); + constexpr uint64_t precision_mask = + (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision) + : uint64_t(0xFFFFFFFFFFFFFFFF); + if ((firstproduct.high & precision_mask) == + precision_mask) { // could further guard with (lower + w < lower) + // regarding the second product, we only need secondproduct.high, but our + // expectation is that the compiler will optimize this extra work away if + // needed. 
+ value128 secondproduct = + full_multiplication(w, powers::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if (secondproduct.high > firstproduct.low) { + firstproduct.high++; + } + } + return firstproduct; +} + +namespace detail { +/** + * For q in (0,350), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * floor(p) + q + * where + * p = log(5**q)/log(2) = q * log(5)/log(2) + * + * For negative values of q in (-400,0), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * -ceil(p) + q + * where + * p = log(5**-q)/log(2) = -q * log(5)/log(2) + */ +constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept { + return (((152170 + 65536) * q) >> 16) + 63; +} +} // namespace detail + +// create an adjusted mantissa, biased by the invalid power2 +// for significant digits already multiplied by 10 ** q. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa +compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { + int hilz = int(w >> 63) ^ 1; + adjusted_mantissa answer; + answer.mantissa = w << hilz; + int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent(); + answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + + invalid_am_bias); + return answer; +} + +// w * 10 ** q, without rounding the representation up. +// the power2 in the exponent will be adjusted by invalid_am_bias. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_error(int64_t q, uint64_t w) noexcept { + int lz = leading_zeroes(w); + w <<= lz; + value128 product = + compute_product_approximation(q, w); + return compute_error_scaled(q, product.high, lz); +} + +// w * 10 ** q +// The returned value should be a valid ieee64 number that simply need to be +// packed. However, in some very rare cases, the computation will fail. 
In such +// cases, we return an adjusted_mantissa with a negative power of 2: the caller +// should recompute in such cases. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_float(int64_t q, uint64_t w) noexcept { + adjusted_mantissa answer; + if ((w == 0) || (q < binary::smallest_power_of_ten())) { + answer.power2 = 0; + answer.mantissa = 0; + // result should be zero + return answer; + } + if (q > binary::largest_power_of_ten()) { + // we want to get infinity: + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + // At this point in time q is in [powers::smallest_power_of_five, + // powers::largest_power_of_five]. + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(w); + w <<= lz; + + // The required precision is binary::mantissa_explicit_bits() + 3 because + // 1. We need the implicit bit + // 2. We need an extra bit for rounding purposes + // 3. We might lose a bit due to the "upperbit" routine (result too small, + // requiring a shift) + + value128 product = + compute_product_approximation(q, w); + // The computed 'product' is always sufficient. + // Mathematical proof: + // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to + // appear) See script/mushtak_lemire.py + + // The "compute_product_approximation" function can be slightly slower than a + // branchless approach: value128 product = compute_product(q, w); but in + // practice, we can win big with the compute_product_approximation if its + // additional branch is easily predicted. Which is best is data specific. + int upperbit = int(product.high >> 63); + int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3; + + answer.mantissa = product.high >> shift; + + answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - + binary::minimum_exponent()); + if (answer.power2 <= 0) { // we have a subnormal? 
+    // Here we have that answer.power2 <= 0 so -answer.power2 >= 0
+    if (-answer.power2 + 1 >=
+        64) { // if we have more than 64 bits below the minimum exponent, you
+              // have a zero for sure.
+      answer.power2 = 0;
+      answer.mantissa = 0;
+      // result should be zero
+      return answer;
+    }
+    // next line is safe because -answer.power2 + 1 < 64
+    answer.mantissa >>= -answer.power2 + 1;
+    // Thankfully, we can't have both "round-to-even" and subnormals because
+    // "round-to-even" only occurs for powers close to 0.
+    answer.mantissa += (answer.mantissa & 1); // round up
+    answer.mantissa >>= 1;
+    // There is a weird scenario where we don't have a subnormal but just miss one.
+    // Suppose we start with 2.2250738585072013e-308, we end up
+    // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
+    // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
+    // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
+    // subnormal, but we can only know this after rounding.
+    // So we only declare a subnormal if we are smaller than the threshold.
+    answer.power2 =
+        (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits()))
+            ? 0
+            : 1;
+    return answer;
+  }
+
+  // usually, we round *up*, but if we fall right in between and we have an
+  // even basis, we need to round down
+  // We are only concerned with the cases where 5**q fits in single 64-bit word.
+  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) &&
+      (q <= binary::max_exponent_round_to_even()) &&
+      ((answer.mantissa & 3) == 1)) { // we may fall between two floats!
+    // To be in-between two floats we need that in doing
+    //   answer.mantissa = product.high >> (upperbit + 64 -
+    //   binary::mantissa_explicit_bits() - 3);
+    // ... we dropped out only zeroes. But if this happened, then we can go
+    // back!!!
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up + } + } + + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) { + answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits()); + answer.power2++; // undo previous addition + } + + answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits()); + if (answer.power2 >= binary::infinite_power()) { // infinity + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + } + return answer; +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_BIGINT_H +#define FASTFLOAT_BIGINT_H + +#include +#include +#include +#include + + +namespace fast_float { + +// the limb width: we want efficient multiplication of double the bits in +// limb, or for 64-bit limbs, at least 64-bit multiplication where we can +// extract the high and low parts efficiently. this is every 64-bit +// architecture except for sparc, which emulates 128-bit multiplication. +// we might have platforms where `CHAR_BIT` is not 8, so let's avoid +// doing `8 * sizeof(limb)`. +#if defined(FASTFLOAT_64BIT) && !defined(__sparc) +#define FASTFLOAT_64BIT_LIMB 1 +typedef uint64_t limb; +constexpr size_t limb_bits = 64; +#else +#define FASTFLOAT_32BIT_LIMB +typedef uint32_t limb; +constexpr size_t limb_bits = 32; +#endif + +typedef span limb_span; + +// number of bits in a bigint. this needs to be at least the number +// of bits required to store the largest bigint, which is +// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or +// ~3600 bits, so we round to 4000. +constexpr size_t bigint_bits = 4000; +constexpr size_t bigint_limbs = bigint_bits / limb_bits; + +// vector-like type that is allocated on the stack. the entire +// buffer is pre-allocated, and only the length changes. 
+template struct stackvec { + limb data[size]; + // we never need more than 150 limbs + uint16_t length{0}; + + stackvec() = default; + stackvec(const stackvec &) = delete; + stackvec &operator=(const stackvec &) = delete; + stackvec(stackvec &&) = delete; + stackvec &operator=(stackvec &&other) = delete; + + // create stack vector from existing limb span. + FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) { + FASTFLOAT_ASSERT(try_extend(s)); + } + + FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + // index from the end of the container + FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + size_t rindex = length - index - 1; + return data[rindex]; + } + + // set the length, without bounds checking. + FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { + length = uint16_t(len); + } + constexpr size_t len() const noexcept { return length; } + constexpr bool is_empty() const noexcept { return length == 0; } + constexpr size_t capacity() const noexcept { return size; } + // append item to vector, without bounds checking + FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { + data[length] = value; + length++; + } + // append item to vector, returning if item was added + FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept { + if (len() < capacity()) { + push_unchecked(value); + return true; + } else { + return false; + } + } + // add items to the vector, from a span, without bounds checking + FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { + limb *ptr = data + length; + std::copy_n(s.ptr, s.len(), ptr); + set_len(len() + s.len()); + } + // try to add items to the vector, returning if items were added + FASTFLOAT_CONSTEXPR20 bool 
try_extend(limb_span s) noexcept { + if (len() + s.len() <= capacity()) { + extend_unchecked(s); + return true; + } else { + return false; + } + } + // resize the vector, without bounds checking + // if the new size is longer than the vector, assign value to each + // appended item. + FASTFLOAT_CONSTEXPR20 + void resize_unchecked(size_t new_len, limb value) noexcept { + if (new_len > len()) { + size_t count = new_len - len(); + limb *first = data + len(); + limb *last = first + count; + ::std::fill(first, last, value); + set_len(new_len); + } else { + set_len(new_len); + } + } + // try to resize the vector, returning if the vector was resized. + FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept { + if (new_len > capacity()) { + return false; + } else { + resize_unchecked(new_len, value); + return true; + } + } + // check if any limbs are non-zero after the given index. + // this needs to be done in reverse order, since the index + // is relative to the most significant limbs. + FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept { + while (index < len()) { + if (rindex(index) != 0) { + return true; + } + index++; + } + return false; + } + // normalize the big integer, so most-significant zero limbs are removed. 
+ FASTFLOAT_CONSTEXPR14 void normalize() noexcept { + while (len() > 0 && rindex(0) == 0) { + length--; + } + } +}; + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +empty_hi64(bool &truncated) noexcept { + truncated = false; + return 0; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, bool &truncated) noexcept { + truncated = false; + int shl = leading_zeroes(r0); + return r0 << shl; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept { + int shl = leading_zeroes(r0); + if (shl == 0) { + truncated = r1 != 0; + return r0; + } else { + int shr = 64 - shl; + truncated = (r1 << shl) != 0; + return (r0 << shl) | (r1 >> shr); + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, bool &truncated) noexcept { + return uint64_hi64(r0, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + return uint64_hi64((x0 << 32) | x1, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + uint64_t x2 = r2; + return uint64_hi64(x0, (x1 << 32) | x2, truncated); +} + +// add two small integers, checking for overflow. +// we want an efficient operation. for msvc, where +// we don't have built-in intrinsics, this is still +// pretty fast. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_add(limb x, limb y, bool &overflow) noexcept { + limb z; +// gcc and clang +#if defined(__has_builtin) +#if __has_builtin(__builtin_add_overflow) + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } +#endif +#endif + + // generic, this still optimizes correctly on MSVC. 
+ z = x + y; + overflow = z < x; + return z; +} + +// multiply two small integers, getting both the high and low bits. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_mul(limb x, limb y, limb &carry) noexcept { +#ifdef FASTFLOAT_64BIT_LIMB +#if defined(__SIZEOF_INT128__) + // GCC and clang both define it as an extension. + __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#else + // fallback, no native 128-bit integer multiplication with carry. + // on msvc, this optimizes identically, somehow. + value128 z = full_multiplication(x, y); + bool overflow; + z.low = scalar_add(z.low, carry, overflow); + z.high += uint64_t(overflow); // cannot overflow + carry = z.high; + return z.low; +#endif +#else + uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#endif +} + +// add scalar value to bigint starting from offset. +// used in grade school multiplication +template +inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec &vec, limb y, + size_t start) noexcept { + size_t index = start; + limb carry = y; + bool overflow; + while (carry != 0 && index < vec.len()) { + vec[index] = scalar_add(vec[index], carry, overflow); + carry = limb(overflow); + index += 1; + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add scalar value to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +small_add(stackvec &vec, limb y) noexcept { + return small_add_from(vec, y, 0); +} + +// multiply bigint by scalar value. +template +inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec &vec, + limb y) noexcept { + limb carry = 0; + for (size_t index = 0; index < vec.len(); index++) { + vec[index] = scalar_mul(vec[index], y, carry); + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add bigint to bigint starting from index. 
+// used in grade school multiplication +template +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec &x, limb_span y, + size_t start) noexcept { + // the effective x buffer is from `xstart..x.len()`, so exit early + // if we can't get that current range. + if (x.len() < start || y.len() > x.len() - start) { + FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); + } + + bool carry = false; + for (size_t index = 0; index < y.len(); index++) { + limb xi = x[index + start]; + limb yi = y[index]; + bool c1 = false; + bool c2 = false; + xi = scalar_add(xi, yi, c1); + if (carry) { + xi = scalar_add(xi, 1, c2); + } + x[index + start] = xi; + carry = c1 | c2; + } + + // handle overflow + if (carry) { + FASTFLOAT_TRY(small_add_from(x, 1, y.len() + start)); + } + return true; +} + +// add bigint to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +large_add_from(stackvec &x, limb_span y) noexcept { + return large_add_from(x, y, 0); +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec &x, limb_span y) noexcept { + limb_span xs = limb_span(x.data, x.len()); + stackvec z(xs); + limb_span zs = limb_span(z.data, z.len()); + + if (y.len() != 0) { + limb y0 = y[0]; + FASTFLOAT_TRY(small_mul(x, y0)); + for (size_t index = 1; index < y.len(); index++) { + limb yi = y[index]; + stackvec zi; + if (yi != 0) { + // re-use the same buffer throughout + zi.set_len(0); + FASTFLOAT_TRY(zi.try_extend(zs)); + FASTFLOAT_TRY(small_mul(zi, yi)); + limb_span zis = limb_span(zi.data, zi.len()); + FASTFLOAT_TRY(large_add_from(x, zis, index)); + } + } + } + + x.normalize(); + return true; +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec &x, limb_span y) noexcept { + if (y.len() == 1) { + FASTFLOAT_TRY(small_mul(x, y[0])); + } else { + FASTFLOAT_TRY(long_mul(x, y)); + } + return true; +} + +template struct pow5_tables { + static constexpr uint32_t large_step = 135; + static 
constexpr uint64_t small_power_of_5[] = { + 1UL, + 5UL, + 25UL, + 125UL, + 625UL, + 3125UL, + 15625UL, + 78125UL, + 390625UL, + 1953125UL, + 9765625UL, + 48828125UL, + 244140625UL, + 1220703125UL, + 6103515625UL, + 30517578125UL, + 152587890625UL, + 762939453125UL, + 3814697265625UL, + 19073486328125UL, + 95367431640625UL, + 476837158203125UL, + 2384185791015625UL, + 11920928955078125UL, + 59604644775390625UL, + 298023223876953125UL, + 1490116119384765625UL, + 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint32_t pow5_tables::large_step; + +template constexpr uint64_t pow5_tables::small_power_of_5[]; + +template constexpr limb pow5_tables::large_power_of_5[]; + +#endif + +// big integer type. implements a small subset of big integer +// arithmetic, using simple algorithms since asymptotically +// faster algorithms are slower for a small number of limbs. +// all operations assume the big-integer is normalized. +struct bigint : pow5_tables<> { + // storage of the limbs, in little-endian order. + stackvec vec; + + FASTFLOAT_CONSTEXPR20 bigint() : vec() {} + bigint(const bigint &) = delete; + bigint &operator=(const bigint &) = delete; + bigint(bigint &&) = delete; + bigint &operator=(bigint &&other) = delete; + + FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() { +#ifdef FASTFLOAT_64BIT_LIMB + vec.push_unchecked(value); +#else + vec.push_unchecked(uint32_t(value)); + vec.push_unchecked(uint32_t(value >> 32)); +#endif + vec.normalize(); + } + + // get the high 64 bits from the vector, and if bits were truncated. 
+  // this is to get the significant digits for the float.
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept {
+#ifdef FASTFLOAT_64BIT_LIMB
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint64_hi64(vec.rindex(0), truncated);
+    } else {
+      uint64_t result = uint64_hi64(vec.rindex(0), vec.rindex(1), truncated);
+      truncated |= vec.nonzero(2);
+      return result;
+    }
+#else
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint32_hi64(vec.rindex(0), truncated);
+    } else if (vec.len() == 2) {
+      return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated);
+    } else {
+      uint64_t result =
+          uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
+      truncated |= vec.nonzero(3);
+      return result;
+    }
+#endif
+  }
+
+  // compare two big integers, returning the larger value.
+  // assumes both are normalized. if the return value is
+  // negative, other is larger, if the return value is
+  // positive, this is larger, otherwise they are equal.
+  // the limbs are stored in little-endian order, so we
+  // must compare the limbs in reverse order.
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept {
+    if (vec.len() > other.vec.len()) {
+      return 1;
+    } else if (vec.len() < other.vec.len()) {
+      return -1;
+    } else {
+      for (size_t index = vec.len(); index > 0; index--) {
+        limb xi = vec[index - 1];
+        limb yi = other.vec[index - 1];
+        if (xi > yi) {
+          return 1;
+        } else if (xi < yi) {
+          return -1;
+        }
+      }
+      return 0;
+    }
+  }
+
+  // shift left each limb n bits, carrying over to the new limb
+  // returns true if we were able to shift all the digits.
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
+    // Internally, for each item, we shift left by n, and add the previous
+    // right shifted limb-bits.
+ // For example, we transform (for u8) shifted left 2, to: + // b10100100 b01000010 + // b10 b10010001 b00001000 + FASTFLOAT_DEBUG_ASSERT(n != 0); + FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8); + + size_t shl = n; + size_t shr = limb_bits - shl; + limb prev = 0; + for (size_t index = 0; index < vec.len(); index++) { + limb xi = vec[index]; + vec[index] = (xi << shl) | (prev >> shr); + prev = xi; + } + + limb carry = prev >> shr; + if (carry != 0) { + return vec.try_push(carry); + } + return true; + } + + // move the limbs left by `n` limbs. + FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept { + FASTFLOAT_DEBUG_ASSERT(n != 0); + if (n + vec.len() > vec.capacity()) { + return false; + } else if (!vec.is_empty()) { + // move limbs + limb *dst = vec.data + n; + const limb *src = vec.data; + std::copy_backward(src, src + vec.len(), dst + vec.len()); + // fill in empty limbs + limb *first = vec.data; + limb *last = first + n; + ::std::fill(first, last, 0); + vec.set_len(n + vec.len()); + return true; + } else { + return true; + } + } + + // move the limbs left by `n` bits. + FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept { + size_t rem = n % limb_bits; + size_t div = n / limb_bits; + if (rem != 0) { + FASTFLOAT_TRY(shl_bits(rem)); + } + if (div != 0) { + FASTFLOAT_TRY(shl_limbs(div)); + } + return true; + } + + // get the number of leading zeros in the bigint. + FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept { + if (vec.is_empty()) { + return 0; + } else { +#ifdef FASTFLOAT_64BIT_LIMB + return leading_zeroes(vec.rindex(0)); +#else + // no use defining a specialized leading_zeroes for a 32-bit type. + uint64_t r0 = vec.rindex(0); + return leading_zeroes(r0 << 32); +#endif + } + } + + // get the number of bits in the bigint. 
+ FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept { + int lz = ctlz(); + return int(limb_bits * vec.len()) - lz; + } + + FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); } + + FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); } + + // multiply as if by 2 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); } + + // multiply as if by 5 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept { + // multiply by a power of 5 + size_t large_length = sizeof(large_power_of_5) / sizeof(limb); + limb_span large = limb_span(large_power_of_5, large_length); + while (exp >= large_step) { + FASTFLOAT_TRY(large_mul(vec, large)); + exp -= large_step; + } +#ifdef FASTFLOAT_64BIT_LIMB + uint32_t small_step = 27; + limb max_native = 7450580596923828125UL; +#else + uint32_t small_step = 13; + limb max_native = 1220703125U; +#endif + while (exp >= small_step) { + FASTFLOAT_TRY(small_mul(vec, max_native)); + exp -= small_step; + } + if (exp != 0) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + // This is similar to https://github.com/llvm/llvm-project/issues/47746, + // except the workaround described there don't work here + FASTFLOAT_TRY(small_mul( + vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))); + } + + return true; + } + + // multiply as if by 10 raised to a power. 
+ FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept { + FASTFLOAT_TRY(pow5(exp)); + return pow2(exp); + } +}; + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_DIGIT_COMPARISON_H +#define FASTFLOAT_DIGIT_COMPARISON_H + +#include +#include +#include +#include + + +namespace fast_float { + +// 1e0 to 1e19 +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; + +// calculate the exponent, in scientific notation, of the number. +// this algorithm is not even close to optimized, but it has no practical +// effect on performance: in order to have a faster algorithm, we'd need +// to slow down performance for faster algorithms, and this is still fast. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { + uint64_t mantissa = num.mantissa; + int32_t exponent = int32_t(num.exponent); + while (mantissa >= 10000) { + mantissa /= 10000; + exponent += 4; + } + while (mantissa >= 100) { + mantissa /= 100; + exponent += 2; + } + while (mantissa >= 10) { + mantissa /= 10; + exponent += 1; + } + return exponent; +} + +// this converts a native floating-point number to an extended-precision float. 
+template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { + using equiv_uint = typename binary_format::equiv_uint; + constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); + constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); + constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); + + adjusted_mantissa am; + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); + equiv_uint bits; +#if FASTFLOAT_HAS_BIT_CAST + bits = std::bit_cast(value); +#else + ::memcpy(&bits, &value, sizeof(T)); +#endif + if ((bits & exponent_mask) == 0) { + // denormal + am.power2 = 1 - bias; + am.mantissa = bits & mantissa_mask; + } else { + // normal + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); + am.power2 -= bias; + am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; + } + + return am; +} + +// get the extended precision value of the halfway point between b and b+u. +// we are given a native float that represents b, so we need to adjust it +// halfway between b and b+u. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { + adjusted_mantissa am = to_extended(value); + am.mantissa <<= 1; + am.mantissa += 1; + am.power2 -= 1; + return am; +} + +// round an extended-precision float to the nearest machine float. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { + int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; + if (-am.power2 >= mantissa_shift) { + // have a denormal float + int32_t shift = -am.power2 + 1; + cb(am, std::min(shift, 64)); + // check for round-up: if rounding-nearest carried us to the hidden bit. + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 
0 + : 1; + return; + } + + // have a normal float, use the default shift. + cb(am, mantissa_shift); + + // check for carry + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); + am.power2++; + } + + // check for infinite: we could have carried to an infinite power + am.mantissa &= ~(uint64_t(1) << binary_format::mantissa_explicit_bits()); + if (am.power2 >= binary_format::infinite_power()) { + am.power2 = binary_format::infinite_power(); + am.mantissa = 0; + } +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1); + uint64_t truncated_bits = am.mantissa & mask; + bool is_above = truncated_bits > halfway; + bool is_halfway = truncated_bits == halfway; + + // shift digits into position + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; + + bool is_odd = (am.mantissa & 1) == 1; + am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + break; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + break; + } + first++; + } +} + +// determine if any non-zero digits were truncated. +// all characters must be valid digits. 
+template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { + // do 8-bit optimizations, can just compare to 8 literal 0s. + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + return true; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + return true; + } + ++first; + } + return false; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { + return is_truncated(s.ptr, s.ptr + s.len()); +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + counter += 8; + count += 8; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 10 + limb(*p - UC('0')); + p++; + counter++; + count++; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { + big.mul(power); + big.add(value); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { + // need to round-up the digits, but need to avoid rounding + // ....9999 to ...10000, which could cause a false halfway point. + add_native(big, 10, 1); + count++; +} + +// parse the significant digits into a big integer +template +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { + // try to minimize the number of big integer and scalar multiplication. + // therefore, try to parse 8 digits at a time, and multiply by the largest + // scalar value (9 or 19 digits) for each step. 
+ size_t counter = 0; + digits = 0; + limb value = 0; +#ifdef FASTFLOAT_64BIT_LIMB + size_t step = 19; +#else + size_t step = 9; +#endif + + // process all integer digits. + UC const *p = num.integer.ptr; + UC const *pend = p + num.integer.len(); + skip_zeros(p, pend); + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (num.fraction.ptr != nullptr) { + truncated |= is_truncated(num.fraction); + } + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + + // add our fraction digits, if they're available. 
+ if (num.fraction.ptr != nullptr) { + p = num.fraction.ptr; + pend = p + num.fraction.len(); + if (digits == 0) { + skip_zeros(p, pend); + } + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + } + + if (counter != 0) { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + } +} + +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept { + FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); + adjusted_mantissa answer; + bool truncated; + answer.mantissa = bigmant.hi64(truncated); + int bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); + answer.power2 = bigmant.bit_length() - 64 + bias; + + round(answer, [truncated](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, + [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || + (is_odd && is_halfway); + }); + }); + + return answer; +} + +// the scaling here is quite simple: we have, for the real digits `m * 10^e`, +// and for the theoretical digits `n * 2^f`. Since `e` is always negative, +// to scale them identically, we do `n * 2^f * 5^-f`, so we now have `m * 2^e`. 
+// we then need to scale by `2^(f- e)`, and then the two significant digits +// are of the same magnitude. +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp( + bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint &real_digits = bigmant; + int32_t real_exp = exponent; + + // get the value of `b`, rounded down, and get a bigint representation of b+h + adjusted_mantissa am_b = am; + // gcc7 buf: use a lambda to remove the noexcept qualifier bug with + // -Wnoexcept-type. + round(am_b, + [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); }); + T b; + to_float(false, am_b, b); + adjusted_mantissa theor = to_extended_halfway(b); + bigint theor_digits(theor.mantissa); + int32_t theor_exp = theor.power2; + + // scale real digits and theor digits to be same power. + int32_t pow2_exp = theor_exp - real_exp; + uint32_t pow5_exp = uint32_t(-real_exp); + if (pow5_exp != 0) { + FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp)); + } + if (pow2_exp > 0) { + FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp))); + } else if (pow2_exp < 0) { + FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp))); + } + + // compare digits, and use it to director rounding + int ord = real_digits.compare(theor_digits); + adjusted_mantissa answer = am; + round(answer, [ord](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); + }); + + return answer; +} + +// parse the significant digits as a big integer to unambiguously round the +// the significant digits. here, we are trying to determine how to round +// an extended float representation close to `b+h`, halfway between `b` +// (the float rounded-down) and `b+u`, the next positive float. 
this +// algorithm is always correct, and uses one of two approaches. when +// the exponent is positive relative to the significant digits (such as +// 1234), we create a big-integer representation, get the high 64-bits, +// determine if any lower bits are truncated, and use that to direct +// rounding. in case of a negative exponent relative to the significant +// digits (such as 1.2345), we create a theoretical representation of +// `b` as a big-integer type, scaled to the same binary exponent as +// the actual digits. we then compare the big integer representations +// of both, and use that to direct rounding. +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept { + // remove the invalid exponent bias + am.power2 -= invalid_am_bias; + + int32_t sci_exp = scientific_exponent(num); + size_t max_digits = binary_format::max_digits(); + size_t digits = 0; + bigint bigmant; + parse_mantissa(bigmant, num, max_digits, digits); + // can't underflow, since digits is at most max_digits. + int32_t exponent = sci_exp + 1 - int32_t(digits); + if (exponent >= 0) { + return positive_digit_comp(bigmant, exponent); + } else { + return negative_digit_comp(bigmant, am, exponent); + } +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_PARSE_NUMBER_H +#define FASTFLOAT_PARSE_NUMBER_H + + +#include +#include +#include +#include +namespace fast_float { + +namespace detail { +/** + * Special case +inf, -inf, nan, infinity, -infinity. + * The case comparisons could be made much faster given that we know that the + * strings a null-free and fixed. 
+ **/ +template +from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first, + UC const *last, + T &value) noexcept { + from_chars_result_t answer{}; + answer.ptr = first; + answer.ec = std::errc(); // be optimistic + bool minusSign = false; + if (*first == + UC('-')) { // assume first < last, so dereference without checks; + // C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; + } +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if (*first == UC('+')) { + ++first; + } +#endif + if (last - first >= 3) { + if (fastfloat_strncasecmp(first, str_const_nan(), 3)) { + answer.ptr = (first += 3); + value = minusSign ? -std::numeric_limits::quiet_NaN() + : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, + // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). + if (first != last && *first == UC('(')) { + for (UC const *ptr = first + 1; ptr != last; ++ptr) { + if (*ptr == UC(')')) { + answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) + break; + } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) || + (UC('A') <= *ptr && *ptr <= UC('Z')) || + (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) + break; // forbidden char, not nan(n-char-seq-opt) + } + } + return answer; + } + if (fastfloat_strncasecmp(first, str_const_inf(), 3)) { + if ((last - first >= 8) && + fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { + answer.ptr = first + 8; + } else { + answer.ptr = first + 3; + } + value = minusSign ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); + return answer; + } + } + answer.ec = std::errc::invalid_argument; + return answer; +} + +/** + * Returns true if the floating-pointing rounding mode is to 'nearest'. + * It is the default on most system. This function is meant to be inexpensive. 
+ * Credit : @mwalcott3 + */ +fastfloat_really_inline bool rounds_to_nearest() noexcept { + // https://lemire.me/blog/2020/06/26/gcc-not-nearest/ +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return false; +#endif + // See + // A fast function to check your floating-point rounding mode + // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ + // + // This function is meant to be equivalent to : + // prior: #include + // return fegetround() == FE_TONEAREST; + // However, it is expected to be much faster than the fegetround() + // function call. + // + // The volatile keywoard prevents the compiler from computing the function + // at compile-time. + // There might be other ways to prevent compile-time optimizations (e.g., + // asm). The value does not need to be std::numeric_limits::min(), any + // small value so that 1 + x should round to 1 would do (after accounting for + // excess precision, as in 387 instructions). + static volatile float fmin = std::numeric_limits::min(); + float fmini = fmin; // we copy it so that it gets loaded at most once. +// +// Explanation: +// Only when fegetround() == FE_TONEAREST do we have that +// fmin + 1.0f == 1.0f - fmin. +// +// FE_UPWARD: +// fmin + 1.0f > 1 +// 1.0f - fmin == 1 +// +// FE_DOWNWARD or FE_TOWARDZERO: +// fmin + 1.0f == 1 +// 1.0f - fmin < 1 +// +// Note: This may fail to be accurate if fast-math has been +// enabled, as rounding conventions may not apply. +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +// todo: is there a VS warning? 
+// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + return (fmini + 1.0f == 1.0f - fmini); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} + +} // namespace detail + +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + 
chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); +} + +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; + + answer.ec = std::errc(); // be optimistic + answer.ptr = pns.lastmatch; + // The implementation of the Clinger's fast path is convoluted because + // we want round-to-nearest in all cases, irrespective of the rounding mode + // selected on the thread. + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { + // Unfortunately, the conventional Clinger's fast path is only possible + // when the system rounds to the nearest float. + // + // We expect the next branch to almost always be selected. + // We could check it first (before the previous branch), but + // there might be performance advantages at having the check + // be last. + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + // We have that fegetround() == FE_TONEAREST. + // Next is Clinger's fast path. 
+ if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { + value = T(pns.mantissa); + if (pns.exponent < 0) { + value = value / binary_format::exact_power_of_ten(-pns.exponent); + } else { + value = value * binary_format::exact_power_of_ten(pns.exponent); + } + if (pns.negative) { + value = -value; + } + return answer; + } + } else { + // We do not have that fegetround() == FE_TONEAREST. + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's + // proposal + if (pns.exponent >= 0 && + pns.mantissa <= + binary_format::max_mantissa_fast_path(pns.exponent)) { +#if defined(__clang__) || defined(FASTFLOAT_32BIT) + // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if (pns.mantissa == 0) { + value = pns.negative ? T(-0.) : T(0.); + return answer; + } +#endif + value = T(pns.mantissa) * + binary_format::exact_power_of_ten(pns.exponent); + if (pns.negative) { + value = -value; + } + return answer; + } + } + } + adjusted_mantissa am = + compute_float>(pns.exponent, pns.mantissa); + if (pns.too_many_digits && am.power2 >= 0) { + if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { + am = compute_error>(pns.exponent, pns.mantissa); + } + } + // If we called compute_float>(pns.exponent, pns.mantissa) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. + if (am.power2 < 0) { + am = digit_comp(pns, am); + } + to_float(pns.negative, am, value); + // Test for over/underflow. 
+ if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + am.power2 == binary_format::infinite_power()) { + answer.ec = std::errc::result_out_of_range; + } + return answer; +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + parsed_number_string_t pns = + parse_number_string(first, last, options); + if (!pns.valid) { + if (options.format & chars_format::no_infnan) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } else { + return detail::parse_infnan(first, last, value); + } + } + + // call overload that takes parsed_number_string_t directly. 
+ return from_chars_advanced(pns, value); +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base) noexcept { + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last || base < 2 || base > 36) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + return parse_int_string(first, last, value, base); +} + +} // namespace fast_float + +#endif diff --git a/deps/fast_float_c_interface/Makefile b/deps/fast_float_c_interface/Makefile new file mode 100644 index 0000000000..4db3efe2c3 --- /dev/null +++ b/deps/fast_float_c_interface/Makefile @@ -0,0 +1,37 @@ +CCCOLOR:="\033[34m" +SRCCOLOR:="\033[33m" +ENDCOLOR:="\033[0m" + +CXX?=c++ +# we need = instead of := so that $@ in QUIET_CXX gets evaluated in the rule and is assigned appropriate value. +TEMP:=$(CXX) +QUIET_CXX=@printf ' %b %b\n' $(CCCOLOR)C++$(ENDCOLOR) $(SRCCOLOR)$@$(ENDCOLOR) 1>&2; +CXX=$(QUIET_CXX)$(TEMP) + +WARN=-Wall -W -Wno-missing-field-initializers + +STD=-pedantic -std=c++11 + +OPT?=-O3 +CLANG := $(findstring clang,$(shell sh -c '$(CC) --version | head -1')) +ifeq ($(OPT),-O3) + ifeq (clang,$(CLANG)) + OPT+=-flto + else + OPT+=-flto=auto -ffat-lto-objects + endif +endif + +# 1) Today src/Makefile passes -m32 flag for explicit 32-bit build on 64-bit machine, via CFLAGS. For 32-bit build on +# 32-bit machine and 64-bit on 64-bit machine, CFLAGS are empty. No other flags are set that can conflict with C++, +# therefore let's use CFLAGS without changes for now. +# 2) FASTFLOAT_ALLOWS_LEADING_PLUS allows +inf to be parsed as inf, instead of error. 
+CXXFLAGS=$(STD) $(OPT) $(WARN) -static -fPIC -fno-exceptions $(CFLAGS) -D FASTFLOAT_ALLOWS_LEADING_PLUS + +.PHONY: all clean + +all: fast_float_strtod.o + +clean: + rm -f *.o || true; + diff --git a/deps/fast_float_c_interface/fast_float_strtod.cpp b/deps/fast_float_c_interface/fast_float_strtod.cpp new file mode 100644 index 0000000000..8e5d19470f --- /dev/null +++ b/deps/fast_float_c_interface/fast_float_strtod.cpp @@ -0,0 +1,24 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "../fast_float/fast_float.h" +#include + +extern "C" +{ + double fast_float_strtod(const char *str, const char** endptr) + { + double temp = 0; + auto answer = fast_float::from_chars(str, str + strlen(str), temp); + if (answer.ec != std::errc()) { + errno = (answer.ec == std::errc::result_out_of_range) ? ERANGE : EINVAL; + } + if (endptr) { + *endptr = answer.ptr; + } + return temp; + } +} diff --git a/src/Makefile b/src/Makefile index f876f55dec..0cbf5763cb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -424,6 +424,17 @@ ENGINE_TEST_OBJ:=$(sort $(patsubst unit/%.c,unit/%.o,$(ENGINE_TEST_FILES))) ENGINE_UNIT_TESTS:=$(ENGINE_NAME)-unit-tests$(PROG_SUFFIX) ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ))) +USE_FAST_FLOAT?=no +ifeq ($(USE_FAST_FLOAT),yes) + # valkey_strtod.h uses this flag to switch valkey_strtod function to fast_float_strtod, + # therefore let's pass it to compiler for preprocessing. + FINAL_CFLAGS += -D USE_FAST_FLOAT + # next, let's build and add actual library containing fast_float_strtod function for linking. 
+ DEPENDENCY_TARGETS += fast_float_c_interface + FAST_FLOAT_STRTOD_OBJECT := ../deps/fast_float_c_interface/fast_float_strtod.o + FINAL_LIBS += $(FAST_FLOAT_STRTOD_OBJECT) +endif + all: $(SERVER_NAME) $(ENGINE_SENTINEL_NAME) $(ENGINE_CLI_NAME) $(ENGINE_BENCHMARK_NAME) $(ENGINE_CHECK_RDB_NAME) $(ENGINE_CHECK_AOF_NAME) $(TLS_MODULE) $(RDMA_MODULE) @echo "" @echo "Hint: It's a good idea to run 'make test' ;)" @@ -588,7 +599,7 @@ bench: $(ENGINE_BENCHMARK_NAME) 32bit: @echo "" - @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386 and libstdc++-11-dev-i386-cross" @echo "" $(MAKE) all-with-unit-tests CFLAGS="-m32" LDFLAGS="-m32" diff --git a/src/debug.c b/src/debug.c index 082e20a3b6..38b66dacb5 100644 --- a/src/debug.c +++ b/src/debug.c @@ -46,6 +46,8 @@ #include #include +#include "valkey_strtod.h" + #ifdef HAVE_BACKTRACE #include #ifndef __OpenBSD__ @@ -846,7 +848,7 @@ void debugCommand(client *c) { "string|integer|double|bignum|null|array|set|map|attrib|push|verbatim|true|false"); } } else if (!strcasecmp(c->argv[1]->ptr, "sleep") && c->argc == 3) { - double dtime = strtod(c->argv[2]->ptr, NULL); + double dtime = valkey_strtod(c->argv[2]->ptr, NULL); long long utime = dtime * 1000000; struct timespec tv; diff --git a/src/resp_parser.c b/src/resp_parser.c index 950d2227b7..101e883d2f 100644 --- a/src/resp_parser.c +++ b/src/resp_parser.c @@ -58,6 +58,8 @@ #include "resp_parser.h" #include "server.h" +#include "valkey_strtod.h" + static int parseBulk(ReplyParser *parser, void *p_ctx) { const char *proto = parser->curr_location; char *p = strchr(proto + 1, '\r'); @@ -150,13 +152,11 @@ static int parseDouble(ReplyParser *parser, void *p_ctx) { parser->curr_location = p + 2; /* for \r\n */ char buf[MAX_LONG_DOUBLE_CHARS + 1]; size_t len = p - proto - 1; - double d; + double d = 0; if (len <= MAX_LONG_DOUBLE_CHARS) { memcpy(buf, proto + 1, len); buf[len] = 
'\0'; - d = strtod(buf, NULL); /* We expect a valid representation. */ - } else { - d = 0; + d = valkey_strtod(buf, NULL); /* We expect a valid representation. */ } parser->callbacks.double_callback(p_ctx, d, proto, parser->curr_location - proto); return C_OK; diff --git a/src/sort.c b/src/sort.c index 92777b068c..ad0496da79 100644 --- a/src/sort.c +++ b/src/sort.c @@ -34,6 +34,8 @@ #include /* isnan() */ #include "cluster.h" +#include "valkey_strtod.h" + zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); serverSortOperation *createSortOperation(int type, robj *pattern) { @@ -479,9 +481,9 @@ void sortCommandGeneric(client *c, int readonly) { } else { if (sdsEncodedObject(byval)) { char *eptr; - - vector[j].u.score = strtod(byval->ptr, &eptr); - if (eptr[0] != '\0' || errno == ERANGE || isnan(vector[j].u.score)) { + errno = 0; + vector[j].u.score = valkey_strtod(byval->ptr, &eptr); + if (eptr[0] != '\0' || errno == ERANGE || errno == EINVAL || isnan(vector[j].u.score)) { int_conversion_error = 1; } } else if (byval->encoding == OBJ_ENCODING_INT) { diff --git a/src/t_zset.c b/src/t_zset.c index 069ab0924a..a1e71208cb 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -60,6 +60,8 @@ #include "intset.h" /* Compact integer set structure */ #include +#include "valkey_strtod.h" + /*----------------------------------------------------------------------------- * Skiplist implementation of the low level API *----------------------------------------------------------------------------*/ @@ -546,11 +548,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->min = (long)min->ptr; } else { if (((char *)min->ptr)[0] == '(') { - spec->min = strtod((char *)min->ptr + 1, &eptr); + spec->min = valkey_strtod((char *)min->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; spec->minex = 1; } else { - spec->min = strtod((char *)min->ptr, &eptr); + spec->min = valkey_strtod((char *)min->ptr, &eptr); if (eptr[0] != '\0' || 
isnan(spec->min)) return C_ERR; } } @@ -558,11 +560,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->max = (long)max->ptr; } else { if (((char *)max->ptr)[0] == '(') { - spec->max = strtod((char *)max->ptr + 1, &eptr); + spec->max = valkey_strtod((char *)max->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; spec->maxex = 1; } else { - spec->max = strtod((char *)max->ptr, &eptr); + spec->max = valkey_strtod((char *)max->ptr, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; } } @@ -757,7 +759,7 @@ double zzlStrtod(unsigned char *vstr, unsigned int vlen) { if (vlen > sizeof(buf) - 1) vlen = sizeof(buf) - 1; memcpy(buf, vstr, vlen); buf[vlen] = '\0'; - return strtod(buf, NULL); + return valkey_strtod(buf, NULL); } double zzlGetScore(unsigned char *sptr) { diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 87bc031fb4..6ab7373007 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -166,6 +166,7 @@ int test_ld2string(int argc, char **argv, int flags); int test_fixedpoint_d2string(int argc, char **argv, int flags); int test_version2num(int argc, char **argv, int flags); int test_reclaimFilePageCache(int argc, char **argv, int flags); +int test_valkey_strtod(int argc, char **argv, int flags); int test_ziplistCreateIntList(int argc, char **argv, int flags); int test_ziplistPop(int argc, char **argv, int flags); int test_ziplistGetElementAtIndex3(int argc, char **argv, int flags); @@ -220,6 +221,7 @@ unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_ra unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, 
{"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}}; +unitTest __test_valkey_strtod_c[] = {{"test_valkey_strtod", test_valkey_strtod}, {NULL, NULL}}; unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", 
test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; unitTest __test_zipmap_c[] = {{"test_zipmapIterateWithLargeKey", test_zipmapIterateWithLargeKey}, {"test_zipmapIterateThroughElements", test_zipmapIterateThroughElements}, {NULL, NULL}}; unitTest __test_zmalloc_c[] = {{"test_zmallocInitialUsedMemory", test_zmallocInitialUsedMemory}, {"test_zmallocAllocReallocCallocAndFree", test_zmallocAllocReallocCallocAndFree}, {"test_zmallocAllocZeroByteAndFree", test_zmallocAllocZeroByteAndFree}, {NULL, NULL}}; @@ -240,6 +242,7 @@ struct unitTestSuite { {"test_sds.c", __test_sds_c}, {"test_sha1.c", __test_sha1_c}, {"test_util.c", __test_util_c}, + {"test_valkey_strtod.c", __test_valkey_strtod_c}, {"test_ziplist.c", __test_ziplist_c}, {"test_zipmap.c", __test_zipmap_c}, {"test_zmalloc.c", __test_zmalloc_c}, diff --git a/src/unit/test_valkey_strtod.c 
b/src/unit/test_valkey_strtod.c new file mode 100644 index 0000000000..4796d7a5b6 --- /dev/null +++ b/src/unit/test_valkey_strtod.c @@ -0,0 +1,36 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + + +#include "../valkey_strtod.h" +#include "errno.h" +#include "math.h" +#include "test_help.h" + +int test_valkey_strtod(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + errno = 0; + double value = valkey_strtod("231.2341234", NULL); + TEST_ASSERT(value == 231.2341234); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("+inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("-inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + return 0; +} diff --git a/src/util.c b/src/util.c index b1235c2822..0b7af2d3fa 100644 --- a/src/util.c +++ b/src/util.c @@ -51,6 +51,8 @@ #include "sha256.h" #include "config.h" +#include "valkey_strtod.h" + #define UNUSED(x) ((void)(x)) /* Glob-style pattern matching. 
*/ @@ -595,10 +597,12 @@ int string2ld(const char *s, size_t slen, long double *dp) { int string2d(const char *s, size_t slen, double *dp) { errno = 0; char *eptr; - *dp = strtod(s, &eptr); + *dp = valkey_strtod(s, &eptr); if (slen == 0 || isspace(((const char *)s)[0]) || (size_t)(eptr - (char *)s) != slen || - (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp)) + (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp) || errno == EINVAL) { + errno = 0; return 0; + } return 1; } diff --git a/src/valkey-cli.c b/src/valkey-cli.c index dc31981483..4416e09431 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -65,6 +65,8 @@ #include "mt19937-64.h" #include "cli_commands.h" +#include "valkey_strtod.h" + #define UNUSED(V) ((void)V) #define OUTPUT_STANDARD 0 @@ -2537,9 +2539,10 @@ static int parseOptions(int argc, char **argv) { exit(1); } } else if (!strcmp(argv[i], "-t") && !lastarg) { + errno = 0; char *eptr; - double seconds = strtod(argv[++i], &eptr); - if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0) { + double seconds = valkey_strtod(argv[++i], &eptr); + if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0 || errno == EINVAL || errno == ERANGE) { fprintf(stderr, "Invalid connection timeout for -t.\n"); exit(1); } diff --git a/src/valkey_strtod.h b/src/valkey_strtod.h new file mode 100644 index 0000000000..037a3f3cec --- /dev/null +++ b/src/valkey_strtod.h @@ -0,0 +1,42 @@ +#ifndef FAST_FLOAT_STRTOD_H +#define FAST_FLOAT_STRTOD_H + +#ifdef USE_FAST_FLOAT + +#include "errno.h" + +/** + * Converts a null-terminated byte string to a double using the fast_float library. + * + * This function provides a C-compatible wrapper around the fast_float library's string-to-double + * conversion functionality. It aims to offer a faster alternative to the standard strtod function. + * + * str: A pointer to the null-terminated byte string to be converted. 
+ * eptr: On success, stores char pointer pointing to '\0' at the end of the string. + * On failure, stores char pointer pointing to first invalid character in the string. + * returns: On success, the function returns the converted double value. + * On failure, it returns 0.0 and stores error code in errno to ERANGE or EINVAL. + * + * note: This function uses the fast_float library (https://github.com/fastfloat/fast_float) for + * the actual conversion, which can be significantly faster than standard library functions. + * Refer to "../deps/fast_float_c_interface" for more details. + * Refer to https://github.com/fastfloat/fast_float for more information on the underlying library. + */ +double fast_float_strtod(const char *str, char **endptr); + +static inline double valkey_strtod(const char *str, char **endptr) { + errno = 0; + return fast_float_strtod(str, endptr); +} + +#else + +#include + +static inline double valkey_strtod(const char *str, char **endptr) { + return strtod(str, endptr); +} + +#endif + +#endif // FAST_FLOAT_STRTOD_H diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 7c15413806..1f0658071a 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -35,12 +35,12 @@ foreach test_dir $test_dirs { set cluster_test_dir unit/cluster foreach file [glob -nocomplain $dir/tests/$cluster_test_dir/*.tcl] { - lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]] + lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]] } set moduleapi_test_dir unit/moduleapi foreach file [glob -nocomplain $dir/tests/$moduleapi_test_dir/*.tcl] { - lappend ::module_api_all_tests $moduleapi_test_dir/[file root [file tail $file]] + lappend ::module_api_all_tests $moduleapi_test_dir/[file root [file tail $file]] } # Index to the next test to run in the ::all_tests list. 
@@ -654,7 +654,7 @@ for {set j 0} {$j < [llength $argv]} {incr j} { } } elseif {$opt eq {--quiet}} { set ::quiet 1 - } elseif {$opt eq {--io-threads}} { + } elseif {$opt eq {--io-threads}} { set ::io_threads 1 } elseif {$opt eq {--tls} || $opt eq {--tls-module}} { package require tls 1.6 From cf1a1e0931bd2db77c23b0058d8660461873dc8f Mon Sep 17 00:00:00 2001 From: Ray Cao Date: Mon, 25 Nov 2024 23:16:46 +0800 Subject: [PATCH 60/92] Optimize sdscatrepr by batch processing printable characters (#1342) Optimize sdscatrepr by reducing realloc calls, furthermore, we can reduce memcpy calls by batch processing of consecutive printable characters. Signed-off-by: Ray Cao Co-authored-by: Ray Cao --- src/sds.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/sds.c b/src/sds.c index 4dd7d709aa..ee7a2c0f97 100644 --- a/src/sds.c +++ b/src/sds.c @@ -954,23 +954,30 @@ void sdsfreesplitres(sds *tokens, int count) { sds sdscatrepr(sds s, const char *p, size_t len) { s = sdsMakeRoomFor(s, len + 2); s = sdscatlen(s, "\"", 1); - while (len--) { - switch (*p) { - case '\\': - case '"': s = sdscatprintf(s, "\\%c", *p); break; - case '\n': s = sdscatlen(s, "\\n", 2); break; - case '\r': s = sdscatlen(s, "\\r", 2); break; - case '\t': s = sdscatlen(s, "\\t", 2); break; - case '\a': s = sdscatlen(s, "\\a", 2); break; - case '\b': s = sdscatlen(s, "\\b", 2); break; - default: - if (isprint(*p)) - s = sdscatlen(s, p, 1); - else + while (len) { + if (isprint(*p)) { + const char *start = p; + while (len && isprint(*p)) { + len--; + p++; + } + s = sdscatlen(s, start, p - start); + } else { + switch (*p) { + case '\\': + case '"': s = sdscatprintf(s, "\\%c", *p); break; + case '\n': s = sdscatlen(s, "\\n", 2); break; + case '\r': s = sdscatlen(s, "\\r", 2); break; + case '\t': s = sdscatlen(s, "\\t", 2); break; + case '\a': s = sdscatlen(s, "\\a", 2); break; + case '\b': s = sdscatlen(s, "\\b", 2); break; + default: s = sdscatprintf(s, 
"\\x%02x", (unsigned char)*p); - break; + break; + } + p++; + len--; } - p++; } return sdscatlen(s, "\"", 1); } From 2d48a39c2781e72200b2e360ef250009c6701711 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 25 Nov 2024 23:56:51 +0800 Subject: [PATCH 61/92] Save open's errno when opening temp rdb fails to prevent it from being modified (#1347) Apparently on Mac, sleep will modify errno to ETIMEDOUT, and then it prints the misleading message: Operation timed out. Signed-off-by: Binbin --- src/replication.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/replication.c b/src/replication.c index dcf7ee3f8c..92dcc3a105 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3688,7 +3688,11 @@ void syncWithPrimary(connection *conn) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); dfd = open(tmpfile, O_CREAT | O_WRONLY | O_EXCL, 0644); if (dfd != -1) break; + /* We save the errno of open to prevent some systems from modifying it after + * the sleep call. For example, sleep in Mac will change errno to ETIMEDOUT. */ + int saved_errno = errno; sleep(1); + errno = saved_errno; } if (dfd == -1) { serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", From 469d41fb37d7c88d508b0a8c7ac495a8f00c717f Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 26 Nov 2024 00:00:47 +0800 Subject: [PATCH 62/92] Avoid double close on repl_transfer_fd (#1349) The code is ok before 2de544cfcc6d1aa7cf6d0c75a6116f7fc27b6fd6, but now we will set server.repl_transfer_fd right after dfd was initiated, and in here we have a double close error since dfd and server.repl_transfer_fd are the same fd. Also move the declaration of dfd/maxtries to a small scope to avoid the confusion since they are only used in this code. 
Signed-off-by: Binbin --- src/replication.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index 92dcc3a105..97aa10dfab 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3414,7 +3414,6 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * establish a connection with the primary. */ void syncWithPrimary(connection *conn) { char tmpfile[256], *err = NULL; - int dfd = -1, maxtries = 5; int psync_result; /* If this event fired after the user turned the instance into a primary @@ -3684,6 +3683,7 @@ void syncWithPrimary(connection *conn) { /* Prepare a suitable temp file for bulk transfer */ if (!useDisklessLoad()) { + int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); dfd = open(tmpfile, O_CREAT | O_WRONLY | O_EXCL, 0644); @@ -3744,7 +3744,6 @@ void syncWithPrimary(connection *conn) { /* Fall through to regular error handling */ error: - if (dfd != -1) close(dfd); connClose(conn); server.repl_transfer_s = NULL; if (server.repl_rdb_transfer_s) { From 9305b49145172da781b8af2b5b96f9643e4367ec Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:51:52 +0200 Subject: [PATCH 63/92] Add tag for dual-channel logs (#999) This PR introduces a consistent tagging system for dual-channel logs. The goal is to improve log readability and filterability, making it easier for operators to manage and analyze log entries. 
Resolves https://github.com/valkey-io/valkey/issues/986 --------- Signed-off-by: naglera --- src/networking.c | 21 ++-- src/replication.c | 102 +++++++++--------- src/server.h | 5 + .../integration/dual-channel-replication.tcl | 2 +- 4 files changed, 70 insertions(+), 60 deletions(-) diff --git a/src/networking.c b/src/networking.c index 93aa9d00ae..9c51efc537 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1713,10 +1713,10 @@ void freeClient(client *c) { /* Log link disconnection with replica */ if (getClientType(c) == CLIENT_TYPE_REPLICA) { - serverLog(LL_NOTICE, - c->flag.repl_rdb_channel ? "Replica %s rdb channel disconnected." - : "Connection with replica %s lost.", - replicationGetReplicaName(c)); + if (c->flag.repl_rdb_channel) + dualChannelServerLog(LL_NOTICE, "Replica %s rdb channel disconnected.", replicationGetReplicaName(c)); + else + serverLog(LL_NOTICE, "Connection with replica %s lost.", replicationGetReplicaName(c)); } /* Free the query buffer */ @@ -1963,14 +1963,15 @@ int freeClientsInAsyncFreeQueue(void) { if (!c->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); c->rdb_client_disconnect_time = server.unixtime; - serverLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long long)c->id, - replicationGetReplicaName(c), server.wait_before_rdb_client_free); + dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", + (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; - serverLog(LL_NOTICE, - "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). 
" - "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + dualChannelServerLog( + LL_NOTICE, + "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " + "Freeing RDB client %llu.", + (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } diff --git a/src/replication.c b/src/replication.c index 97aa10dfab..260da1cd6e 100644 --- a/src/replication.c +++ b/src/replication.c @@ -227,9 +227,9 @@ void addRdbReplicaToPsyncWait(client *replica_rdb_client) { tail->refcount++; } } - serverLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", - replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, - tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); + dualChannelServerLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", + replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, + tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); replica_rdb_client->ref_repl_buf_node = tail ? ln : NULL; /* Prevent rdb client from being freed before psync is established. 
*/ replica_rdb_client->flag.protected_rdb_channel = 1; @@ -252,8 +252,8 @@ void backfillRdbReplicasToPsyncWait(void) { if (replica_rdb_client->ref_repl_buf_node) continue; replica_rdb_client->ref_repl_buf_node = ln; head->refcount++; - serverLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", - (long long unsigned int)replica_rdb_client->id); + dualChannelServerLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", + (long long unsigned int)replica_rdb_client->id); } raxStop(&iter); } @@ -271,10 +271,10 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { } replica_rdb_client->ref_repl_buf_node = NULL; replica_rdb_client->flag.protected_rdb_channel = 0; - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", - replicationGetReplicaName(replica_main_client), - (long long unsigned int)replica_main_client->associated_rdb_client_id, - o ? "ref count decreased" : "doesn't exist"); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", + replicationGetReplicaName(replica_main_client), + (long long unsigned int)replica_main_client->associated_rdb_client_id, + o ? 
"ref count decreased" : "doesn't exist"); uint64_t id = htonu64(replica_rdb_client->id); raxRemove(server.replicas_waiting_psync, (unsigned char *)&id, sizeof(id), NULL); } @@ -391,8 +391,8 @@ void freeReplicaReferencedReplBuffer(client *replica) { if (replica->flag.repl_rdb_channel) { uint64_t rdb_cid = htonu64(replica->id); if (raxRemove(server.replicas_waiting_psync, (unsigned char *)&rdb_cid, sizeof(rdb_cid), NULL)) { - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", - replicationGetReplicaName(replica), (long long unsigned int)replica->id); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", + replicationGetReplicaName(replica), (long long unsigned int)replica->id); } } if (replica->ref_repl_buf_node != NULL) { @@ -1121,10 +1121,11 @@ void syncCommand(client *c) { * resync. */ if (primary_replid[0] != '?') server.stat_sync_partial_err++; if (c->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { - serverLog(LL_NOTICE, - "Replica %s is capable of dual channel synchronization, and partial sync isn't possible. " - "Full sync will continue with dedicated RDB channel.", - replicationGetReplicaName(c)); + dualChannelServerLog(LL_NOTICE, + "Replica %s is capable of dual channel synchronization, and partial sync " + "isn't possible. " + "Full sync will continue with dedicated RDB channel.", + replicationGetReplicaName(c)); const char *buf = "+DUALCHANNELSYNC\r\n"; if (connWrite(c->conn, buf, strlen(buf)) != (int)strlen(buf)) { freeClientAsync(c); @@ -2565,7 +2566,7 @@ void freePendingReplDataBuf(void) { * provisional primary struct, and free local replication buffer. 
*/ void replicationAbortDualChannelSyncTransfer(void) { serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); - serverLog(LL_NOTICE, "Aborting dual channel sync"); + dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); if (server.repl_rdb_transfer_s) { connClose(server.repl_rdb_transfer_s); server.repl_rdb_transfer_s = NULL; @@ -2594,8 +2595,9 @@ int sendCurrentOffsetToReplica(client *replica) { int buflen; buflen = snprintf(buf, sizeof(buf), "$ENDOFF:%lld %s %d %llu\r\n", server.primary_repl_offset, server.replid, server.db->id, (long long unsigned int)replica->id); - serverLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", - replicationGetReplicaName(replica), server.primary_repl_offset, (long long unsigned int)replica->id); + dualChannelServerLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", + replicationGetReplicaName(replica), server.primary_repl_offset, + (long long unsigned int)replica->id); if (connSyncWrite(replica->conn, buf, buflen, server.repl_syncio_timeout * 1000) != buflen) { freeClientAsync(replica); return C_ERR; @@ -2604,7 +2606,7 @@ int sendCurrentOffsetToReplica(client *replica) { } static int dualChannelReplHandleHandshake(connection *conn, sds *err) { - serverLog(LL_DEBUG, "Received first reply from primary using rdb connection."); + dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. 
*/ if (server.primary_auth) { char *args[] = {"AUTH", NULL, NULL}; @@ -2620,7 +2622,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { argc++; *err = sendCommandArgv(conn, argc, args, lens); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } } @@ -2630,14 +2632,14 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { NULL); sdsfree(portstr); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; @@ -2646,11 +2648,11 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); + dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; } if ((*err)[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); + dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; @@ -2660,17 +2662,17 @@ static int 
dualChannelReplHandleAuthReply(connection *conn, sds *err) { static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); + dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; } if (*err[0] == '-') { - serverLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", - *err); + dualChannelServerLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", + *err); return C_ERR; } if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); return C_ERR; } return C_OK; @@ -2684,7 +2686,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { } if (*err[0] == '\0') { /* Retry again later */ - serverLog(LL_DEBUG, "Received empty $ENDOFF response"); + dualChannelServerLog(LL_DEBUG, "Received empty $ENDOFF response"); return C_RETRY; } long long reploffset; @@ -2693,7 +2695,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Parse end offset response */ char *endoff_format = "$ENDOFF:%lld %40s %d %llu"; if (sscanf(*err, endoff_format, &reploffset, primary_replid, &dbid, &rdb_client_id) != 4) { - serverLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); + dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } server.rdb_client_id = rdb_client_id; @@ -2741,7 +2743,8 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in 
error state. */ if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", + connGetLastError(conn)); goto error; } switch (server.repl_rdb_channel_state) { @@ -2830,13 +2833,13 @@ int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t int nread = connRead(conn, data_block->buf + data_block->used, read); if (nread == -1) { if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); + dualChannelServerLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); cancelReplicationHandshake(1); } return C_ERR; } if (nread == 0) { - serverLog(LL_VERBOSE, "Provisional primary closed connection"); + dualChannelServerLog(LL_VERBOSE, "Provisional primary closed connection"); cancelReplicationHandshake(1); return C_ERR; } @@ -2865,7 +2868,7 @@ void bufferReplData(connection *conn) { if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { - serverLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); + dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. */ connSetReadHandler(conn, NULL); break; @@ -2938,7 +2941,7 @@ void dualChannelSyncSuccess(void) { /* Wait for the accumulated buffer to be processed before reading any more replication updates */ if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { /* Sync session aborted during repl data streaming. 
*/ - serverLog(LL_WARNING, "Failed to stream local replication buffer into memory"); + dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { replicationAbortDualChannelSyncTransfer(); @@ -2947,7 +2950,7 @@ void dualChannelSyncSuccess(void) { return; } freePendingReplDataBuf(); - serverLog(LL_NOTICE, "Successfully streamed replication data into memory"); + dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ replicationSteadyStateInit(); replicationSendAck(); /* Send ACK to notify primary that replica is synced */ @@ -2963,7 +2966,7 @@ int dualChannelSyncHandlePsync(void) { if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { - serverLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); + dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); cancelReplicationHandshake(1); return C_ERR; } @@ -2972,7 +2975,7 @@ int dualChannelSyncHandlePsync(void) { } serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ - serverLog(LL_DEBUG, "Dual channel sync - psync established after rdb load"); + dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); dualChannelSyncSuccess(); return C_OK; } @@ -3066,8 +3069,9 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* While in dual channel replication, we should use our prepared repl id and offset. 
*/ psync_replid = server.repl_provisional_primary.replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); - serverLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, - psync_offset); + dualChannelServerLog(LL_NOTICE, + "Trying a partial resynchronization using main channel (request %s:%s).", + psync_replid, psync_offset); } else if (server.cached_primary) { psync_replid = server.cached_primary->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->reploff + 1); @@ -3214,7 +3218,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* A response of +DUALCHANNELSYNC from the primary implies that partial * synchronization is not possible and that the primary supports full * sync using dedicated RDB channel. Full sync will continue that way. */ - serverLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); + dualChannelServerLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); sdsfree(reply); return PSYNC_FULLRESYNC_DUAL_CHANNEL; } @@ -3258,7 +3262,7 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) return C_ERR; if ((*err)[0] == '-') { - serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); + dualChannelServerLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); return C_ERR; } return C_OK; @@ -3267,7 +3271,7 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { if (server.debug_pause_after_fork) debugPauseProcess(); if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Write error."); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. 
Write error."); *err = sdsnew(connGetLastError(conn)); return C_ERR; } @@ -3279,8 +3283,8 @@ int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); + dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", + server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); @@ -3328,7 +3332,7 @@ void dualChannelSetupMainConnForPsync(connection *conn) { } if (ret == C_ERR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? 
err : ""); cancelReplicationHandshake(1); } sdsfree(err); @@ -3717,8 +3721,8 @@ void syncWithPrimary(connection *conn) { } if (connSetReadHandler(conn, NULL) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; diff --git a/src/server.h b/src/server.h index 09b67b2670..0ec105a7ba 100644 --- a/src/server.h +++ b/src/server.h @@ -4044,6 +4044,11 @@ void debugPauseProcess(void); _serverLog(level, __VA_ARGS__); \ } while (0) +/* dualChannelServerLog - Log messages related to dual-channel operations + * This macro wraps the serverLog function, prepending "" + * to the log message. */ +#define dualChannelServerLog(level, ...) serverLog(level, " " __VA_ARGS__) + #define serverDebug(fmt, ...) printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__) #define serverDebugMark() printf("-- MARK %s:%d --\n", __FILE__, __LINE__) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 05bdc130c1..055ed670ab 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -485,7 +485,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } wait_for_value_to_propegate_to_replica $primary $replica "key1" # Confirm the occurrence of a race condition. 
- wait_for_log_messages -1 {"*Dual channel sync - psync established after rdb load*"} 0 2000 1 + wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1 } } } From 66ae8b71352853ee90a0a9d4cddbbb406c189416 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Wed, 27 Nov 2024 07:34:02 +0200 Subject: [PATCH 64/92] change the container image to ubuntu:plucky (#1359) Our fortify workflow is running on ubuntu lunar container that is EOL since [January 25, 2024(January 25, 2024](https://lists.ubuntu.com/archives/ubuntu-announce/2024-January/000298.html). This case cause the workflow to fail during update actions like: ``` apt-get update && apt-get install -y make gcc-13 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-1[3](https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209#step:5:3) 100 make all-with-unit-tests CC=gcc OPT=-O3 SERVER_CFLAGS='-Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' shell: sh -e {0} Ign:1 http://security.ubuntu.com/ubuntu lunar-security InRelease Err:2 http://security.ubuntu.com/ubuntu lunar-security Release [4](https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209#step:5:4)04 Not Found [IP: 91.189.91.82 80] Ign:3 http://archive.ubuntu.com/ubuntu lunar InRelease Ign:4 http://archive.ubuntu.com/ubuntu lunar-updates InRelease Ign:[5](https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209#step:5:5) http://archive.ubuntu.com/ubuntu lunar-backports InRelease Err:[6](https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209#step:5:7) http://archive.ubuntu.com/ubuntu lunar Release 404 Not Found [IP: 185.125.190.81 80] Err:7 http://archive.ubuntu.com/ubuntu lunar-updates Release 404 Not Found [IP: 185.125.190.81 80] Err:8 http://archive.ubuntu.com/ubuntu lunar-backports Release 404 Not Found [IP: 185.125.190.81 80] Reading package lists... 
E: The repository 'http://security.ubuntu.com/ubuntu lunar-security Release' does not have a Release file. E: The repository 'http://archive.ubuntu.com/ubuntu lunar Release' does not have a Release file. E: The repository 'http://archive.ubuntu.com/ubuntu lunar-updates Release' does not have a Release file. E: The repository 'http://archive.ubuntu.com/ubuntu lunar-backports Release' does not have a Release file. update-alternatives: error: alternative path /usr/bin/gcc-[13](https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209#step:5:14) doesn't exist Error: Process completed with exit code 2. ``` example: https://github.com/valkey-io/valkey/actions/runs/12021130026/job/33547460209 This pr uses the latest stable ubuntu image release [plucky](https://hub.docker.com/layers/library/ubuntu/plucky/images/sha256-dc4565c7636f006c26d54c988faae576465e825ea349fef6fd3af6bf5100e8b6?context=explore) Signed-off-by: Ran Shidlansik --- .github/workflows/daily.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index e39e672689..c06d73440d 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -86,7 +86,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'fortify') - container: ubuntu:lunar + container: ubuntu:plucky timeout-minutes: 14400 steps: - name: prep From 5d08149e726bb7d393d76401c7be683ceaf67b7b Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 27 Nov 2024 18:02:07 +0800 Subject: [PATCH 65/92] Use fake client flag to replace not conn check (#1198) The fake client flag was introduced in #1063, we want this to replace all !conn fake client checks. 
Signed-off-by: Binbin --- src/module.c | 1 + src/networking.c | 12 ++++++++++-- src/server.c | 3 ++- src/server.h | 7 ++++--- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/module.c b/src/module.c index 1e98b36f30..794038beb4 100644 --- a/src/module.c +++ b/src/module.c @@ -681,6 +681,7 @@ void moduleReleaseTempClient(client *c) { c->bufpos = 0; c->raw_flag = 0; c->flag.module = 1; + c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; if (c->bstate.async_rm_call_handle) { diff --git a/src/networking.c b/src/networking.c index 9c51efc537..01aaa48148 100644 --- a/src/networking.c +++ b/src/networking.c @@ -314,7 +314,11 @@ int prepareClientToWrite(client *c) { * is set. */ if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR; - if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ + /* Skip the fake client, such as the fake client for AOF loading. + * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client + * but has a connection to cache the response. */ + if (c->flag.fake && c->id != CLIENT_ID_CACHED_RESPONSE) return C_ERR; + serverAssert(c->conn); /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ @@ -348,6 +352,9 @@ sds aggregateClientOutputBuffer(client *c) { * It needs be paired with `deleteCachedResponseClient` function to stop caching. */ client *createCachedResponseClient(int resp) { struct client *recording_client = createClient(NULL); + /* It is a fake client but with a connection, setting a special client id, + * so we can identify it's a fake cached response client. */ + recording_client->id = CLIENT_ID_CACHED_RESPONSE; recording_client->resp = resp; /* Allocating the `conn` allows to prepare the caching client before adding * data to the clients output buffer by `prepareClientToWrite`. 
*/ @@ -4499,7 +4506,8 @@ int checkClientOutputBufferLimits(client *c) { * * Returns 1 if client was (flagged) closed. */ int closeClientOnOutputBufferLimitReached(client *c, int async) { - if (!c->conn) return 0; /* It is unsafe to free fake clients. */ + if (c->flag.fake) return 0; /* It is unsafe to free fake clients. */ + serverAssert(c->conn); serverAssert(c->reply_bytes < SIZE_MAX - (1024 * 64)); /* Note that c->reply_bytes is irrelevant for replica clients * (they use the global repl buffers). */ diff --git a/src/server.c b/src/server.c index 6d346ac74c..a83ef9096c 100644 --- a/src/server.c +++ b/src/server.c @@ -970,9 +970,10 @@ void updateClientMemoryUsage(client *c) { } int clientEvictionAllowed(client *c) { - if (server.maxmemory_clients == 0 || c->flag.no_evict || !c->conn) { + if (server.maxmemory_clients == 0 || c->flag.no_evict || c->flag.fake) { return 0; } + serverAssert(c->conn); int type = getClientType(c); return (type == CLIENT_TYPE_NORMAL || type == CLIENT_TYPE_PUBSUB); } diff --git a/src/server.h b/src/server.h index 0ec105a7ba..70bd3868c3 100644 --- a/src/server.h +++ b/src/server.h @@ -1094,9 +1094,10 @@ typedef struct { /* With multiplexing we need to take per-client state. * Clients are taken in a linked list. */ -#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ - need more reserved IDs use UINT64_MAX-1, \ - -2, ... and so forth. */ +#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ + need more reserved IDs use UINT64_MAX-1, \ + -2, ... and so forth. */ +#define CLIENT_ID_CACHED_RESPONSE (UINT64_MAX - 1) /* Client for cached response, see createCachedResponseClient. */ /* Replication backlog is not a separate memory, it just is one consumer of * the global replication buffer. 
This structure records the reference of From db7b7396ff1cc98832396a57e8d3e76e0eebd5fa Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 28 Nov 2024 00:16:55 +0800 Subject: [PATCH 66/92] Make KEYS can visit expired key in import-source state (#1326) After #1185, a client in import-source state can visit expired key both in read commands and write commands, this commit handle keyIsExpired function to handle import-source state as well, so KEYS can visit the expired key. This is not particularly important, but it ensures the definition, also doing some cleanup around the test, verified that the client can indeed visit the expired key. Signed-off-by: Binbin --- src/db.c | 10 ++++++++-- src/networking.c | 2 +- tests/unit/expire.tcl | 23 ++++++++++++++--------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/db.c b/src/db.c index d3ef19027d..3f6452c44c 100644 --- a/src/db.c +++ b/src/db.c @@ -390,7 +390,7 @@ robj *dbRandomKey(serverDb *db) { if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically - * expired in the repilca, so the function cannot stop because + * expired in the replica, so the function cannot stop because * expireIfNeeded() is false, nor it can stop because * dictGetFairRandomKey() returns NULL (there are keys to return). * To prevent the infinite loop we do some tries, but if there @@ -1808,7 +1808,13 @@ int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { /* Check if the key is expired. */ int keyIsExpired(serverDb *db, robj *key) { int dict_index = getKVStoreIndexForKey(key->ptr); - return keyIsExpiredWithDictIndex(db, key, dict_index); + if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return 0; + + /* See expireIfNeededWithDictIndex for more details. 
*/ + if (server.primary_host == NULL && server.import_mode) { + if (server.current_client && server.current_client->flag.import_source) return 0; + } + return 1; } keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { diff --git a/src/networking.c b/src/networking.c index 01aaa48148..97479967f6 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3617,7 +3617,7 @@ void clientCommand(client *c) { "NO-TOUCH (ON|OFF)", " Will not touch LRU/LFU stats when this mode is on.", "IMPORT-SOURCE (ON|OFF)", - " Mark this connection as an import source if server.import_mode is true.", + " Mark this connection as an import source if import-mode is enabled.", " Sync tools can set their connections into 'import-source' state to visit", " expired keys.", NULL}; diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index fba425f62d..941acfad38 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -841,7 +841,7 @@ start_server {tags {"expire"}} { r set foo1 bar PX 1 r set foo2 bar PX 1 - after 100 + after 10 assert_equal [r dbsize] {2} @@ -879,22 +879,27 @@ start_server {tags {"expire"}} { assert_equal [r debug set-active-expire 1] {OK} } {} {needs:debug} - test {RANDOMKEY can return expired key in import mode} { + test {Client can visit expired key in import-source state} { r flushall r config set import-mode yes - assert_equal [r client import-source on] {OK} - r set foo1 bar PX 1 + r set foo1 1 PX 1 after 10 - set client [valkey [srv "host"] [srv "port"] 0 $::tls] - if {!$::singledb} { - $client select 9 - } - assert_equal [$client ttl foo1] {-2} + # Normal clients cannot visit expired key. + assert_equal [r get foo1] {} + assert_equal [r ttl foo1] {-2} + assert_equal [r dbsize] 1 + # Client can visit expired key when in import-source state. 
+ assert_equal [r client import-source on] {OK} + assert_equal [r ttl foo1] {0} + assert_equal [r get foo1] {1} + assert_equal [r incr foo1] {2} assert_equal [r randomkey] {foo1} + assert_equal [r scan 0 match * count 10000] {0 foo1} + assert_equal [r keys *] {foo1} assert_equal [r client import-source off] {OK} r config set import-mode no From a939cb88ee0c0512c003106be483b7c6968b3e7f Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 28 Nov 2024 14:10:48 +0800 Subject: [PATCH 67/92] Handle keyIsExpiredWithDictIndex to make it check for import mode (#1368) In #1326 we make KEYS can visit expired key in import-source state by updating keyIsExpired to check for import mode. But after #1205, we now use keyIsExpiredWithDictIndex to optimize and remove the redundant dict_index, and keyIsExpiredWithDictIndex does not handle this logic. In this commit, we handle keyIsExpiredWithDictIndex to make it check for import mode as well so that KEYS can visit the expired key. Signed-off-by: Binbin --- src/db.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/db.c b/src/db.c index 3f6452c44c..3c3ccb4899 100644 --- a/src/db.c +++ b/src/db.c @@ -1789,7 +1789,7 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { +static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; @@ -1806,9 +1806,8 @@ int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { } /* Check if the key is expired. 
*/ -int keyIsExpired(serverDb *db, robj *key) { - int dict_index = getKVStoreIndexForKey(key->ptr); - if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return 0; +int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ if (server.primary_host == NULL && server.import_mode) { @@ -1817,9 +1816,15 @@ int keyIsExpired(serverDb *db, robj *key) { return 1; } +/* Check if the key is expired. */ +int keyIsExpired(serverDb *db, robj *key) { + int dict_index = getKVStoreIndexForKey(key->ptr); + return keyIsExpiredWithDictIndex(db, key, dict_index); +} + keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; - if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return KEY_VALID; + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID; /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: From fd58f8d0585a3e558fbb837c2302ef51dc8d1810 Mon Sep 17 00:00:00 2001 From: zvi-code <54795925+zvi-code@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:27:00 +0200 Subject: [PATCH 68/92] Disable lazy free in defrag test to fix 32bit daily failure (#1370) Signed-off-by: Zvi Schneider Co-authored-by: Zvi Schneider --- tests/unit/memefficiency.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index d5a6a6efe2..67329f03f1 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -720,11 +720,11 @@ run_solo {defrag} { } } - start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" 
lazyfree-lazy-user-del no}} { test_active_defrag "cluster" } - start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "standalone" } } ;# run_solo From 4695d118dd6126b9b4f3e3415198df398e8bbb79 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 29 Nov 2024 18:13:34 +0800 Subject: [PATCH 69/92] RDMA builtin support (#1209) There are several patches in this PR: * Abstract set/rewrite config bind option: `bind` option is a special config, `socket` and `tls` are using the same one. However RDMA uses the similar style but different one. Use a bit abstract work to make it flexible for both `socket` and `RDMA`. (Even for QUIC in the future.) * Introduce closeListener for connection type: closing socket by a simple syscall would be fine, RDMA has complex logic. Introduce connection type specific close listener method. * RDMA: Use valkey.conf style instead of module parameters: use `--rdma-bind` and `--rdma-port` style instead of module parameters. The module style config `rdma.bind` and `rdma.port` are removed. * RDMA: Support builtin: support `make BUILD_RDMA=yes`. module style is still kept for now. 
Signed-off-by: zhenwei pi --- README.md | 26 +++- cmake/Modules/SourceFiles.cmake | 1 + cmake/Modules/ValkeySetup.cmake | 29 +++-- src/CMakeLists.txt | 2 +- src/Makefile | 30 ++--- src/config.c | 114 +++++++++++++--- src/connection.c | 3 + src/connection.h | 10 ++ src/rdma.c | 222 ++++++++------------------------ src/server.c | 28 ++-- src/server.h | 13 +- src/socket.c | 14 ++ src/tls.c | 5 + src/unix.c | 5 + tests/rdma/run.py | 2 +- tests/unit/introspection.tcl | 4 + valkey.conf | 48 +++++++ 17 files changed, 314 insertions(+), 242 deletions(-) diff --git a/README.md b/README.md index a32ac255df..c447cc8d47 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,13 @@ To build TLS as Valkey module: Note that sentinel mode does not support TLS module. To build with experimental RDMA support you'll need RDMA development libraries -(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). For now, Valkey only -supports RDMA as connection module mode. Run: +(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). + +To build RDMA support as Valkey built-in: + + % make BUILD_RDMA=yes + +To build RDMA as Valkey module: % make BUILD_RDMA=module @@ -203,20 +208,27 @@ Note that Valkey Over RDMA is an experimental feature. It may be changed or removed in any minor or major version. Currently, it is only supported on Linux. 
-To manually run a Valkey server with RDMA mode: +* RDMA built-in mode: + ``` + ./src/valkey-server --protected-mode no \ + --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` - % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 +* RDMA module mode: + ``` + ./src/valkey-server --protected-mode no \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` It's possible to change bind address/port of RDMA by runtime command: - 192.168.122.100:6379> CONFIG SET rdma.port 6380 + 192.168.122.100:6379> CONFIG SET rdma-port 6380 It's also possible to have both RDMA and TCP available, and there is no conflict of TCP(6379) and RDMA(6379), Ex: % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 \ --port 6379 Note that the network card (192.168.122.100 of this example) should support diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 873229d6f0..c34ae644a2 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -88,6 +88,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/tracking.c ${CMAKE_SOURCE_DIR}/src/socket.c ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/rdma.c ${CMAKE_SOURCE_DIR}/src/sha256.c ${CMAKE_SOURCE_DIR}/src/timeout.c ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index 4fafd07910..8a4d4da1c9 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -208,25 +208,30 @@ if (BUILD_RDMA) # RDMA support (Linux only) if (LINUX AND NOT APPLE) valkey_parse_build_option(${BUILD_RDMA} USE_RDMA) + find_package(PkgConfig REQUIRED) + # Locate librdmacm & libibverbs, fail if we can't find them + valkey_pkg_config(librdmacm RDMACM_LIBS) + valkey_pkg_config(libibverbs IBVERBS_LIBS) + 
message(STATUS "${RDMACM_LIBS};${IBVERBS_LIBS}") + list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") + if (USE_RDMA EQUAL 2) # Module message(STATUS "Building RDMA as module") add_valkey_server_compiler_options("-DUSE_RDMA=2") - - # Locate librdmacm & libibverbs, fail if we can't find them - valkey_pkg_config(librdmacm RDMACM_LIBS) - valkey_pkg_config(libibverbs IBVERBS_LIBS) - - list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") - set(BUILD_RDMA_MODULE 1) - elseif (USE_RDMA EQUAL 1) - # RDMA can only be built as a module. So disable it - message(WARNING "BUILD_RDMA can be one of: [NO | 0 | MODULE], but '${BUILD_RDMA}' was provided") - message(STATUS "RDMA build is disabled") - set(USE_RDMA 0) + set(BUILD_RDMA_MODULE 2) + elseif (USE_RDMA EQUAL 1) # Builtin + message(STATUS "Building RDMA as builtin") + add_valkey_server_compiler_options("-DUSE_RDMA=1") + add_valkey_server_compiler_options("-DBUILD_RDMA_MODULE=0") + list(APPEND SERVER_LIBS "${RDMA_LIBS}") endif () else () message(WARNING "RDMA is only supported on Linux platforms") endif () +else () + # By default, RDMA is disabled + message(STATUS "RDMA is disabled") + set(USE_RDMA 0) endif () set(BUILDING_ARM64 0) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 51e1b5a2e6..b87dff3db0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,7 +55,7 @@ if (BUILD_RDMA_MODULE) set(MODULE_NAME "valkey-rdma") message(STATUS "Building RDMA module") add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") - target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE -DUSE_RDMA=1) + target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE=2 -DUSE_RDMA=1) target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") # remove the "lib" prefix from the module set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") diff --git a/src/Makefile b/src/Makefile index 0cbf5763cb..3b4ad0a2ef 100644 --- a/src/Makefile +++ b/src/Makefile @@ -325,26 +325,26 @@ ifeq ($(BUILD_TLS),module) 
TLS_MODULE_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_MODULE) endif -BUILD_RDMA:=no -RDMA_MODULE= -RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so -RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) -ifeq ($(BUILD_RDMA),module) - FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) - RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) +RDMA_LIBS= +RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) ifeq ($(RDMA_PKGCONFIG),0) RDMA_LIBS=$(shell $(PKG_CONFIG) --libs librdmacm libibverbs) else RDMA_LIBS=-lrdmacm -libverbs endif - RDMA_MODULE=$(RDMA_MODULE_NAME) - RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE $(RDMA_LIBS) -else -ifeq ($(BUILD_RDMA),no) - # disable RDMA, do nothing -else - $(error "RDMA is only supported as module (BUILD_RDMA=module), or disabled (BUILD_RDMA=no)") + +ifeq ($(BUILD_RDMA),yes) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE=$(BUILD_NO) + FINAL_LIBS += $(RDMA_LIBS) endif + +RDMA_MODULE= +RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so +RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) +ifeq ($(BUILD_RDMA),module) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) + RDMA_MODULE=$(RDMA_MODULE_NAME) + RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) -DBUILD_RDMA_MODULE=$(BUILD_MODULE) $(RDMA_LIBS) endif ifndef V @@ -411,7 +411,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o 
crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/config.c b/src/config.c index 
c4009adefa..7f0901c50a 100644 --- a/src/config.c +++ b/src/config.c @@ -1536,10 +1536,27 @@ void rewriteConfigOOMScoreAdjValuesOption(standardConfig *config, const char *na } /* Rewrite the bind option. */ -void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { +static void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state, char **bindaddr, int bindaddr_count) { UNUSED(config); int force = 1; sds line, addresses; + + /* Rewrite as bind ... */ + if (bindaddr_count > 0) + addresses = sdsjoin(bindaddr, bindaddr_count, " "); + else + addresses = sdsnew("\"\""); + line = sdsnew(name); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, addresses); + sdsfree(addresses); + + rewriteConfigRewriteLine(state, name, line, force); +} + +/* Rewrite the bind option. */ +static void rewriteConfigSocketBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); int is_default = 0; /* Compare server.bindaddr with CONFIG_DEFAULT_BINDADDR */ @@ -1559,17 +1576,7 @@ void rewriteConfigBindOption(standardConfig *config, const char *name, struct re return; } - /* Rewrite as bind ... */ - if (server.bindaddr_count > 0) - addresses = sdsjoin(server.bindaddr, server.bindaddr_count, " "); - else - addresses = sdsnew("\"\""); - line = sdsnew(name); - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, addresses); - sdsfree(addresses); - - rewriteConfigRewriteLine(state, name, line, force); + rewriteConfigBindOption(config, name, state, server.bindaddr, server.bindaddr_count); } /* Rewrite the loadmodule option. 
*/ @@ -2637,7 +2644,7 @@ static int applyBind(const char **err) { tcp_listener->ct = connectionByType(CONN_TYPE_SOCKET); if (changeListener(tcp_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - if (tls_listener) closeListener(tls_listener); /* failed with TLS together */ + if (tls_listener) connCloseListener(tls_listener); /* failed with TLS together */ return 0; } @@ -2649,7 +2656,7 @@ static int applyBind(const char **err) { tls_listener->ct = connectionByType(CONN_TYPE_TLS); if (changeListener(tls_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - closeListener(tcp_listener); /* failed with TCP together */ + connCloseListener(tcp_listener); /* failed with TCP together */ return 0; } } @@ -2922,8 +2929,9 @@ static sds getConfigNotifyKeyspaceEventsOption(standardConfig *config) { return keyspaceEventsFlagsToString(server.notify_keyspace_events); } -static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err) { +static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err, char **bindaddr, int *bindaddr_count) { UNUSED(config); + int orig_bindaddr_count = *bindaddr_count; int j; if (argc > CONFIG_BINDADDR_MAX) { @@ -2935,11 +2943,73 @@ static int setConfigBindOption(standardConfig *config, sds *argv, int argc, cons if (argc == 1 && sdslen(argv[0]) == 0) argc = 0; /* Free old bind addresses */ - for (j = 0; j < server.bindaddr_count; j++) { - zfree(server.bindaddr[j]); + for (j = 0; j < orig_bindaddr_count; j++) zfree(bindaddr[j]); + for (j = 0; j < argc; j++) bindaddr[j] = zstrdup(argv[j]); + *bindaddr_count = argc; + + return 1; +} + +static int setConfigSocketBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.bindaddr, &server.bindaddr_count); +} + +static int setConfigRdmaBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + 
UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.rdma_ctx_config.bindaddr, &server.rdma_ctx_config.bindaddr_count); +} + +static sds getConfigRdmaBindOption(standardConfig *config) { + UNUSED(config); + return sdsjoin(server.rdma_ctx_config.bindaddr, server.rdma_ctx_config.bindaddr_count, " "); +} + +static void rewriteConfigRdmaBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); + + if (server.rdma_ctx_config.bindaddr_count) { + rewriteConfigBindOption(config, name, state, server.rdma_ctx_config.bindaddr, + server.rdma_ctx_config.bindaddr_count); + } +} + +static int applyRdmaBind(const char **err) { + connListener *rdma_listener = listenerByType(CONN_TYPE_RDMA); + + if (!rdma_listener) { + *err = "No RDMA building support."; + return 0; + } + + rdma_listener->bindaddr = server.rdma_ctx_config.bindaddr; + rdma_listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + rdma_listener->port = server.rdma_ctx_config.port; + rdma_listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(rdma_listener) == C_ERR) { + *err = "Failed to bind to specified addresses for RDMA."; + return 0; + } + + return 1; +} + +static int updateRdmaPort(const char **err) { + connListener *listener = listenerByType(CONN_TYPE_RDMA); + + if (listener == NULL) { + *err = "No RDMA building support."; + return 0; + } + + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(listener) == C_ERR) { + *err = "Unable to listen on this port for RDMA. 
Check server logs."; + return 0; } - for (j = 0; j < argc; j++) server.bindaddr[j] = zstrdup(argv[j]); - server.bindaddr_count = argc; return 1; } @@ -3237,6 +3307,9 @@ standardConfig static_configs[] = { createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod), createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.rdma_ctx_config.port, 0, INTEGER_CONFIG, NULL, updateRdmaPort), + createIntConfig("rdma-rx-size", NULL, IMMUTABLE_CONFIG, 64 * 1024, 16 * 1024 * 1024, server.rdma_ctx_config.rx_size, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-completion-vector", NULL, IMMUTABLE_CONFIG, -1, 1024, server.rdma_ctx_config.completion_vector, -1, INTEGER_CONFIG, NULL, NULL), /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), @@ -3316,7 +3389,8 @@ standardConfig static_configs[] = { createSpecialConfig("client-output-buffer-limit", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigClientOutputBufferLimitOption, getConfigClientOutputBufferLimitOption, rewriteConfigClientOutputBufferLimitOption, NULL), createSpecialConfig("oom-score-adj-values", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigOOMScoreAdjValuesOption, getConfigOOMScoreAdjValuesOption, rewriteConfigOOMScoreAdjValuesOption, updateOOMScoreAdj), createSpecialConfig("notify-keyspace-events", NULL, MODIFIABLE_CONFIG, setConfigNotifyKeyspaceEventsOption, getConfigNotifyKeyspaceEventsOption, rewriteConfigNotifyKeyspaceEventsOption, NULL), - createSpecialConfig("bind", NULL, 
MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigBindOption, getConfigBindOption, rewriteConfigBindOption, applyBind), + createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigSocketBindOption, getConfigBindOption, rewriteConfigSocketBindOption, applyBind), + createSpecialConfig("rdma-bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigRdmaBindOption, getConfigRdmaBindOption, rewriteConfigRdmaBindOption, applyRdmaBind), createSpecialConfig("replicaof", "slaveof", IMMUTABLE_CONFIG | MULTI_ARG_CONFIG, setConfigReplicaOfOption, getConfigReplicaOfOption, rewriteConfigReplicaOfOption, NULL), createSpecialConfig("latency-tracking-info-percentiles", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigLatencyTrackingInfoPercentilesOutputOption, getConfigLatencyTrackingInfoPercentilesOutputOption, rewriteConfigLatencyTrackingInfoPercentilesOutputOption, NULL), diff --git a/src/connection.c b/src/connection.c index f0c1c2d364..8807541d77 100644 --- a/src/connection.c +++ b/src/connection.c @@ -66,6 +66,9 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); + /* may fail if without BUILD_RDMA=yes */ + RegisterConnectionTypeRdma(); + return C_OK; } diff --git a/src/connection.h b/src/connection.h index 0762441732..8a2775ee34 100644 --- a/src/connection.h +++ b/src/connection.h @@ -60,6 +60,7 @@ typedef enum { #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" +#define CONN_TYPE_RDMA "rdma" #define CONN_TYPE_MAX 8 /* 8 is enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -79,6 +80,7 @@ typedef struct ConnectionType { int (*addr)(connection *conn, char *ip, size_t ip_len, int *port, int remote); int (*is_local)(connection *conn); int (*listen)(connListener *listener); + void (*closeListener)(connListener *listener); /* create/shutdown/close connection */ connection *(*conn_create)(void); @@ -442,6 +444,13 @@ 
static inline int connListen(connListener *listener) { return listener->ct->listen(listener); } +/* Close a listened listener */ +static inline void connCloseListener(connListener *listener) { + if (listener->count) { + listener->ct->closeListener(listener); + } +} + /* Get accept_handler of a connection type */ static inline aeFileProc *connAcceptHandler(ConnectionType *ct) { if (ct) return ct->accept_handler; @@ -454,6 +463,7 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); +int RegisterConnectionTypeRdma(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/rdma.c b/src/rdma.c index 7cdcb24913..de7ea396a1 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -10,9 +10,10 @@ #define VALKEYMODULE_CORE_MODULE #include "server.h" - -#if defined USE_RDMA && defined __linux__ /* currently RDMA is only supported on Linux */ #include "connection.h" + +#if defined __linux__ /* currently RDMA is only supported on Linux */ +#if (USE_RDMA == 1 /* BUILD_YES */) || ((USE_RDMA == 2 /* BUILD_MODULE */) && (BUILD_RDMA_MODULE == 2)) #include "connhelpers.h" #include @@ -128,12 +129,10 @@ typedef struct rdma_listener { static list *pending_list; static rdma_listener *rdma_listeners; +static serverRdmaContextConfig *rdma_config; static ConnectionType CT_RDMA; -static int valkey_rdma_rx_size = VALKEY_RDMA_DEFAULT_RX_SIZE; -static int valkey_rdma_comp_vector = -1; /* -1 means a random one */ - static void serverRdmaError(char *err, const char *fmt, ...) 
{ va_list ap; @@ -272,7 +271,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) { /* setup recv buf & MR */ access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE; - length = valkey_rdma_rx_size; + length = rdma_config->rx_size; ctx->rx.addr = page_aligned_zalloc(length); ctx->rx.length = length; ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access); @@ -295,6 +294,7 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { struct ibv_comp_channel *comp_channel = NULL; struct ibv_cq *cq = NULL; struct ibv_pd *pd = NULL; + int comp_vector = rdma_config->completion_vector; if (ibv_query_device(cm_id->verbs, &device_attr)) { serverLog(LL_WARNING, "RDMA: ibv ibv query device failed"); @@ -317,8 +317,13 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { ctx->comp_channel = comp_channel; + /* negative number means a random one */ + if (comp_vector < 0) { + comp_vector = abs((int)random()); + } + cq = ibv_create_cq(cm_id->verbs, VALKEY_RDMA_MAX_WQE * 2, NULL, comp_channel, - valkey_rdma_comp_vector % cm_id->verbs->num_comp_vectors); + comp_vector % cm_id->verbs->num_comp_vectors); if (!cq) { serverLog(LL_WARNING, "RDMA: ibv create cq failed"); return C_ERR; @@ -1610,9 +1615,28 @@ int connRdmaListen(connListener *listener) { rdma_listener++; } + rdma_config = listener->priv; return C_OK; } +static void connRdmaCloseListener(connListener *listener) { + /* Close old servers */ + for (int i = 0; i < listener->count; i++) { + if (listener->fd[i] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); + listener->fd[i] = -1; + struct rdma_listener *rdma_listener = &rdma_listeners[i]; + rdma_destroy_id(rdma_listener->cm_id); + rdma_destroy_event_channel(rdma_listener->cm_channel); + } + + listener->count = 0; + zfree(rdma_listeners); + rdma_listeners = NULL; + rdma_config = NULL; +} + static int connRdmaAddr(connection *conn, char *ip, size_t 
ip_len, int *port, int remote) { rdma_connection *rdma_conn = (rdma_connection *)conn; struct rdma_cm_id *cm_id = rdma_conn->cm_id; @@ -1740,6 +1764,7 @@ static ConnectionType CT_RDMA = { //.cluster_accept_handler = NULL, .is_local = connRdmaIsLocal, .listen = connRdmaListen, + .closeListener = connRdmaCloseListener, .addr = connRdmaAddr, /* create/close connection */ @@ -1769,17 +1794,6 @@ static ConnectionType CT_RDMA = { .process_pending_data = rdmaProcessPendingData, }; -static struct connListener *rdmaListener(void) { - static struct connListener *listener = NULL; - - if (listener) return listener; - - listener = listenerByType(CONN_TYPE_RDMA); - serverAssert(listener != NULL); - - return listener; -} - ConnectionType *connectionTypeRdma(void) { static ConnectionType *ct_rdma = NULL; @@ -1791,133 +1805,28 @@ ConnectionType *connectionTypeRdma(void) { return ct_rdma; } -/* rdma listener has different create/close logic from TCP, we can't re-use 'int changeListener(connListener *listener)' - * directly */ -static int rdmaChangeListener(void) { - struct connListener *listener = rdmaListener(); - - /* Close old servers */ - for (int i = 0; i < listener->count; i++) { - if (listener->fd[i] == -1) continue; - - aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); - listener->fd[i] = -1; - struct rdma_listener *rdma_listener = &rdma_listeners[i]; - rdma_destroy_id(rdma_listener->cm_id); - rdma_destroy_event_channel(rdma_listener->cm_channel); - } - - listener->count = 0; - zfree(rdma_listeners); - rdma_listeners = NULL; - - closeListener(listener); - - /* Just close the server if port disabled */ - if (listener->port == 0) { - if (server.set_proc_title) serverSetProcTitle(NULL); - return VALKEYMODULE_OK; - } - - /* Re-create listener */ - if (connListen(listener) != C_OK) { - return VALKEYMODULE_ERR; - } - - /* Create event handlers */ - if (createSocketAcceptHandler(listener, listener->ct->accept_handler) != C_OK) { - serverPanic("Unrecoverable error creating 
%s accept handler.", listener->ct->get_type(NULL)); - } - - if (server.set_proc_title) serverSetProcTitle(NULL); - - return VALKEYMODULE_OK; -} - -#ifdef BUILD_RDMA_MODULE - -#include "release.h" - -static long long rdmaGetPort(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - struct connListener *listener = rdmaListener(); - - return listener->port; -} - -static int rdmaSetPort(const char *name, long long val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(privdata); - UNUSED(err); - struct connListener *listener = rdmaListener(); - listener->port = val; - - return VALKEYMODULE_OK; -} - -static ValkeyModuleString *rdma_bind; - -static void rdmaBuildBind(void *ctx) { - struct connListener *listener = rdmaListener(); - - if (rdma_bind) ValkeyModule_FreeString(NULL, rdma_bind); - - sds rdma_bind_str = sdsjoin(listener->bindaddr, listener->bindaddr_count, " "); - rdma_bind = ValkeyModule_CreateString(ctx, rdma_bind_str, sdslen(rdma_bind_str)); +int RegisterConnectionTypeRdma(void) { + return connTypeRegister(&CT_RDMA); } -static ValkeyModuleString *rdmaGetBind(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); +#else - return rdma_bind; +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s not builtin", CONN_TYPE_RDMA); + return C_ERR; } -static int rdmaSetBind(const char *name, ValkeyModuleString *val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(err); - struct connListener *listener = rdmaListener(); - const char *bind = ValkeyModule_StringPtrLen(val, NULL); - int nexts; - sds *exts = sdssplitlen(bind, strlen(bind), " ", 1, &nexts); - - if (nexts > CONFIG_BINDADDR_MAX) { - serverLog(LL_WARNING, "RDMA: Unsupported bind ( > %d)", CONFIG_BINDADDR_MAX); - return VALKEYMODULE_ERR; - } - - /* Free old bind addresses */ - for (int j = 0; j < listener->bindaddr_count; j++) { - zfree(listener->bindaddr[j]); - } - - for (int j = 0; j < nexts; j++) 
listener->bindaddr[j] = zstrdup(exts[j]); - listener->bindaddr_count = nexts; - - sdsfreesplitres(exts, nexts); - rdmaBuildBind(privdata); - - return VALKEYMODULE_OK; -} +#endif -static int rdmaApplyListener(ValkeyModuleCtx *ctx, void *privdata, ValkeyModuleString **err) { - UNUSED(ctx); - UNUSED(privdata); - UNUSED(err); +#if BUILD_RDMA_MODULE == 2 /* BUILD_MODULE */ - return rdmaChangeListener(); -} +#include "release.h" -static void rdmaListenerAddConfig(void *ctx) { - serverAssert(ValkeyModule_RegisterNumericConfig(ctx, "port", 0, VALKEYMODULE_CONFIG_DEFAULT, 0, 65535, rdmaGetPort, - rdmaSetPort, rdmaApplyListener, NULL) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_RegisterStringConfig(ctx, "bind", "", VALKEYMODULE_CONFIG_DEFAULT, rdmaGetBind, - rdmaSetBind, rdmaApplyListener, ctx) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_LoadConfigs(ctx) == VALKEYMODULE_OK); -} int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + UNUSED(argc); + /* Connection modules MUST be part of the same build as valkey. 
*/ if (strcmp(REDIS_BUILD_ID_RAW, serverBuildIdRaw())) { serverLog(LL_NOTICE, "Connection type %s was not built together with the valkey-server used.", CONN_TYPE_RDMA); @@ -1936,40 +1845,6 @@ int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { if (connTypeRegister(&CT_RDMA) != C_OK) return VALKEYMODULE_ERR; - rdmaListenerAddConfig(ctx); - - struct connListener *listener = rdmaListener(); - listener->ct = connectionTypeRdma(); - listener->bindaddr = zcalloc_num(CONFIG_BINDADDR_MAX, sizeof(listener->bindaddr[0])); - - for (int i = 0; i < argc; i++) { - robj *str = (robj *)argv[i]; - int nexts; - sds *exts = sdssplitlen(str->ptr, strlen(str->ptr), "=", 1, &nexts); - if (nexts != 2) { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - if (!strcasecmp(exts[0], "bind")) { - listener->bindaddr[listener->bindaddr_count++] = zstrdup(exts[1]); - } else if (!strcasecmp(exts[0], "port")) { - listener->port = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "rx-size")) { - valkey_rdma_rx_size = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "comp-vector")) { - valkey_rdma_comp_vector = atoi(exts[1]); - } else { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - sdsfreesplitres(exts, nexts); - } - - rdmaBuildBind(ctx); - if (valkey_rdma_comp_vector == -1) valkey_rdma_comp_vector = abs((int)random()); - return VALKEYMODULE_OK; } @@ -1981,4 +1856,11 @@ int ValkeyModule_OnUnload(void *arg) { #endif /* BUILD_RDMA_MODULE */ -#endif /* USE_RDMA && __linux__ */ +#else /* __linux__ */ + +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s is supported on Linux only", CONN_TYPE_RDMA); + return C_ERR; +} + +#endif /* __linux__ */ diff --git a/src/server.c b/src/server.c index a83ef9096c..df57659715 100644 --- a/src/server.c +++ b/src/server.c @@ -2482,19 +2482,6 @@ void checkTcpBacklogSettings(void) { #endif 
} -void closeListener(connListener *sfd) { - int j; - - for (j = 0; j < sfd->count; j++) { - if (sfd->fd[j] == -1) continue; - - aeDeleteFileEvent(server.el, sfd->fd[j], AE_READABLE); - close(sfd->fd[j]); - } - - sfd->count = 0; -} - /* Create an event handler for accepting new connections in TCP or TLS domain sockets. * This works atomically for all socket fds */ int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler) { @@ -2558,7 +2545,7 @@ int listenToPort(connListener *sfd) { continue; /* Rollback successful listens before exiting */ - closeListener(sfd); + connCloseListener(sfd); return C_ERR; } if (server.socket_mark_id > 0) anetSetSockMarkId(NULL, sfd->fd[sfd->count], server.socket_mark_id); @@ -2899,6 +2886,17 @@ void initListeners(void) { listener->priv = &server.unix_ctx_config; /* Unix socket specified */ } + if (server.rdma_ctx_config.port != 0) { + conn_index = connectionIndexByType(CONN_TYPE_RDMA); + if (conn_index < 0) serverPanic("Failed finding connection listener of %s", CONN_TYPE_RDMA); + listener = &server.listeners[conn_index]; + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + listener->priv = &server.rdma_ctx_config; + } + /* create all the configured listener, and add handler to start to accept */ int listen_fds = 0; for (int j = 0; j < CONN_TYPE_MAX; j++) { @@ -6297,7 +6295,7 @@ connListener *listenerByType(const char *typename) { /* Close original listener, re-create a new listener from the updated bind address & port */ int changeListener(connListener *listener) { /* Close old servers */ - closeListener(listener); + connCloseListener(listener); /* Just close the server if port disabled */ if (listener->port == 0) { diff --git a/src/server.h b/src/server.h index 70bd3868c3..b9e8be9479 100644 --- a/src/server.h +++ b/src/server.h @@ -1614,6 +1614,17 
@@ typedef struct serverUnixContextConfig { unsigned int perm; /* UNIX socket permission (see mode_t) */ } serverUnixContextConfig; +/*----------------------------------------------------------------------------- + * RDMA Context Configuration + *----------------------------------------------------------------------------*/ +typedef struct serverRdmaContextConfig { + char *bindaddr[CONFIG_BINDADDR_MAX]; + int bindaddr_count; + int port; + int rx_size; + int completion_vector; +} serverRdmaContextConfig; + /*----------------------------------------------------------------------------- * AOF manifest definition *----------------------------------------------------------------------------*/ @@ -2229,6 +2240,7 @@ struct valkeyServer { int tls_auth_clients; serverTLSContextConfig tls_ctx_config; serverUnixContextConfig unix_ctx_config; + serverRdmaContextConfig rdma_ctx_config; /* cpu affinity */ char *server_cpulist; /* cpu affinity list of server main/io thread. */ char *bio_cpulist; /* cpu affinity list of bio thread. 
*/ @@ -3293,7 +3305,6 @@ void setupSignalHandlers(void); int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler); connListener *listenerByType(const char *typename); int changeListener(connListener *listener); -void closeListener(connListener *listener); struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name); struct serverCommand *lookupCommand(robj **argv, int argc); struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s); diff --git a/src/socket.c b/src/socket.c index 7344d66ad8..d89e6c8767 100644 --- a/src/socket.c +++ b/src/socket.c @@ -339,6 +339,19 @@ static int connSocketListen(connListener *listener) { return listenToPort(listener); } +static void connSocketCloseListener(connListener *listener) { + int j; + + for (j = 0; j < listener->count; j++) { + if (listener->fd[j] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[j], AE_READABLE); + close(listener->fd[j]); + } + + listener->count = 0; +} + static int connSocketBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { int fd = anetTcpNonBlockConnect(NULL, addr, port); if (fd == -1) { @@ -395,6 +408,7 @@ static ConnectionType CT_Socket = { .addr = connSocketAddr, .is_local = connSocketIsLocal, .listen = connSocketListen, + .closeListener = connSocketCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateSocket, diff --git a/src/tls.c b/src/tls.c index d1dd567354..48b75553de 100644 --- a/src/tls.c +++ b/src/tls.c @@ -805,6 +805,10 @@ static int connTLSListen(connListener *listener) { return listenToPort(listener); } +static void connTLSCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static void connTLSShutdown(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; @@ -1147,6 +1151,7 @@ static ConnectionType CT_TLS = { .addr = connTLSAddr, .is_local = connTLSIsLocal, .listen = connTLSListen, + .closeListener = 
connTLSCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateTLS, diff --git a/src/unix.c b/src/unix.c index 35778779f9..86df05bd52 100644 --- a/src/unix.c +++ b/src/unix.c @@ -74,6 +74,10 @@ static int connUnixListen(connListener *listener) { return C_OK; } +static void connUnixCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static connection *connCreateUnix(void) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Unix; @@ -174,6 +178,7 @@ static ConnectionType CT_Unix = { .addr = connUnixAddr, .is_local = connUnixIsLocal, .listen = connUnixListen, + .closeListener = connUnixCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateUnix, diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 0724c27adc..09168f368a 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -63,7 +63,7 @@ def test_rdma(ipaddr): rdmapath = valkeydir + "/src/valkey-rdma.so" svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", - "--loadmodule", rdmapath, "port=6379", "bind=" + ipaddr] + "--loadmodule", rdmapath, "--rdma-port", "6379", "--rdma-bind", ipaddr] svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE) try: diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 352f5f183e..d79bb1c7da 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -558,6 +558,10 @@ start_server {tags {"introspection"}} { req-res-logfile client-default-resp dual-channel-replication-enabled + rdma-completion-vector + rdma-rx-size + rdma-bind + rdma-port } if {!$::tls} { diff --git a/valkey.conf b/valkey.conf index bf82b01874..8d3e11c515 100644 --- a/valkey.conf +++ b/valkey.conf @@ -300,6 +300,54 @@ tcp-keepalive 300 # # tls-session-cache-timeout 60 +################################### RDMA 
###################################### + +# Valkey Over RDMA is experimental, it may be changed or be removed in any minor or major version. +# By default, RDMA is disabled. To enable it, the "rdma-port" configuration +# directive can be used to define RDMA-listening ports. +# +# rdma-port 6379 +# rdma-bind 192.168.1.100 + +# The RDMA receive transfer buffer is 1M by default. It can be set between 64K and 16M. +# Note that page size aligned size is preferred. +# +# rdma-rx-size 1048576 + +# The RDMA completion queue will use the completion vector to signal completion events +# via hardware interrupts. A large number of hardware interrupts can affect CPU performance. +# It is possible to tune the performance using rdma-completion-vector. +# +# Example 1. a) Pin hardware interrupt vectors [0, 3] to CPU [0, 3]. +# b) Set CPU affinity for valkey to CPU [4, X]. +# c) Any valkey server uses a random RDMA completion vector [-1]. +# All valkey servers will not affect each other and will be isolated from kernel interrupts. +# +# SYS SYS SYS SYS VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | +# INTR0 INTR1 INTR2 INTR3 +# +# Example 2. a) 1:1 pin hardware interrupt vectors [0, X] to CPU [0, X]. +# b) Set CPU affinity for valkey [M] to CPU [M]. +# c) Valkey server [M] uses RDMA completion vector [M]. +# A single CPU [M] handles hardware interrupts, the RDMA completion vector [M], +# and the valkey server [M] within its context only. +# This avoids overhead and function calls across multiple CPUs, fully isolating +# each valkey server from one another. +# +# VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | | | | +# INTR0 INTR1 INTR2 INTR3 INTR4 INTR5 INTRX +# +# Use 0 and positive numbers to specify the RDMA completion vector, or specify -1 to allow +# the server to use a random vector for a new connection. The default vector is -1. 
+# +# rdma-completion-vector 0 + ################################# GENERAL ##################################### # By default the server does not run as a daemon. Use 'yes' if you need it. From c8ceb2ee255c899b0cb05b69f0511fc7dcf4ddca Mon Sep 17 00:00:00 2001 From: Stav Ben-Tov <90314138+stav-bentov@users.noreply.github.com> Date: Sun, 1 Dec 2024 13:24:18 +0200 Subject: [PATCH 70/92] Use zfree_with_size for client buffer (#1376) Replace occurrences of 'zfree' with 'zfree_with_size' to improve performance. 'zfree_with_size' function avoids calling 'zmalloc_size' to retrieve buffer size and uses the previous calculation of size for calling 'zfree_with_size'. This results in faster memory deallocation and reduces overhead. Signed-off-by: stav bentov Co-authored-by: stav bentov --- src/networking.c | 2 +- src/server.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 97479967f6..bbd684a3e5 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1760,7 +1760,7 @@ void freeClient(client *c) { /* Free data structures. 
*/ listRelease(c->reply); c->reply = NULL; - zfree(c->buf); + zfree_with_size(c->buf, c->buf_usable_size); c->buf = NULL; freeReplicaReferencedReplBuffer(c); freeClientArgv(c); diff --git a/src/server.c b/src/server.c index df57659715..ef9f523145 100644 --- a/src/server.c +++ b/src/server.c @@ -889,9 +889,10 @@ int clientsCronResizeOutputBuffer(client *c, mstime_t now_ms) { if (new_buffer_size) { oldbuf = c->buf; + size_t oldbuf_size = c->buf_usable_size; c->buf = zmalloc_usable(new_buffer_size, &c->buf_usable_size); memcpy(c->buf, oldbuf, c->bufpos); - zfree(oldbuf); + zfree_with_size(oldbuf, oldbuf_size); } return 0; } From 9c48f567907087637e19bf30a5a137d8b50e0df3 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 1 Dec 2024 21:33:21 +0800 Subject: [PATCH 71/92] Reset repl_down_since to zero only on state change (#1149) We should reset repl_down_since only on state change, in the current code, if the rdb channel in the dual channel is normal, that is, rdb is loaded normally, but the psync channel is abnormal, we will set repl_down_since 0 here. If the primary is down at this time, the replica may be abnormal when calculating data_age in cluster failover, since repl_state != REPL_STATE_CONNECTED, this causes the replica to be unable to initiate an election due to the old data_age. In dualChannelSyncHandleRdbLoadCompletion, if the psync channel is not established, the function will return. We will set repl_state to REPL_STATE_CONNECTED and set repl_down_since to 0 in dualChannelSyncSuccess, that is, in establishPrimaryConnection. See also 677d10b2a8ff7f13033ccfe56ffcd246dbe70fb6 for more details. 
Signed-off-by: Binbin --- src/replication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replication.c b/src/replication.c index 260da1cd6e..d17199bfc3 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2405,10 +2405,10 @@ void readSyncBulkPayload(connection *conn) { } else { replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; /* Send the initial ACK immediately to put this replica in online state. */ replicationSendAck(); } - server.repl_down_since = 0; /* Fire the primary link modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); From 7043ef0bbb627b66bcaa75351b1b141c96852df8 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Sun, 1 Dec 2024 15:33:43 +0200 Subject: [PATCH 72/92] Split dual-channel COB overrun tests to separate servers (#1374) 1. The test isn't waiting long enough for the output buffer to overrun. This problem is happening because an error from the previous test is bleeding into the current test's logs. The simplest fix would be to split these tests. 2. Increased replication timeout to ensure sync fails due to output buffer overrun before a timeout occurs. 
Fixes #1367 Signed-off-by: naglera --- .../integration/dual-channel-replication.tcl | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 055ed670ab..e417dad6c9 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -775,7 +775,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 + $replica config set repl-timeout 60 $primary config set repl-backlog-size 1mb test "Test dual-channel-replication primary gets cob overrun before established psync" { @@ -815,6 +815,37 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary should abort sync" } + stop_write_load $load_handle0 + stop_write_load $load_handle1 + stop_write_load $load_handle2 + } +} + +start_server {tags {"dual-channel-replication external:skip"}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + set loglines [count_log_lines 0] + + $primary config set repl-diskless-sync yes + $primary config set dual-channel-replication-enabled yes + $primary config set client-output-buffer-limit "replica 1100k 0 0" + $primary config set loglevel debug + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + set replica_log [srv 0 stdout] + set replica_pid [srv 0 pid] + + set load_handle0 [start_write_load $primary_host $primary_port 60] + set load_handle1 [start_write_load $primary_host $primary_port 60] + set load_handle2 [start_write_load $primary_host $primary_port 60] + + $replica config set dual-channel-replication-enabled yes + $replica config set loglevel debug + $replica config set repl-timeout 60 + $primary config set repl-backlog-size 1mb 
$replica debug pause-after-fork 1 $primary debug populate 1000 primary 100000 From 90475af59429583182402ee3b408d7bcb36d56cd Mon Sep 17 00:00:00 2001 From: Vadym Khoptynets <1099644+poiuj@users.noreply.github.com> Date: Sun, 1 Dec 2024 17:12:27 +0200 Subject: [PATCH 73/92] Free strings during BGSAVE/BGAOFRW to reduce copy-on-write (#905) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Motivation** Copy-on-write (COW) amplification refers to the issue where writing to a small object leads to the entire page being cloned, resulting in inefficient memory usage. This issue arises during the BGSAVE process, which can be particularly problematic on instances with limited memory. If the BGSAVE process could release unneeded memory, it could reduce memory consumption. To address this, the BGSAVE process calls the `madvise` function to signal the operating system to reclaim the buffer. However, this approach does not work for buffers smaller than a page (usually 4KiB). Even after multiple such calls, where a full page may be free, the operating system will not reclaim it. To solve this issue, we can call `zfree` directly. This allows the allocator (jemalloc) to handle the bookkeeping and release pages when buffers are no longer needed. This approach reduces copy-on-write events. **Benchmarks** To understand how usage of `zfree` affects BGSAVE and the memory consumption I ran 45 benchmarks that compare my clone with the vanilla version. The benchmark has the following steps: 1. Start a new Valkey process 2. Fill the DB with data sequentially 3. Run a warmup to randomize the memory layout 4. Introduce fragmentation by deleting part of the keys 5. In parallel: 1. Trigger BGSAVE 2. Start 80/20 get/set load I varied the following parameters to understand their influence: 1. Number of keys: 3M, 6M, and 12M. 2. Data size. While keys themselves are of fixed length ~30 bytes, the value size is 120, 250, 500, 1000, and 2000 bytes. 3. 
Fragmentation. I delete 5%, 10%, and 15% of the original key range. I'm attaching a graph of BGSAVE process memory consumption. Instead of all benchmarks, I show the most representative runs IMO. 3m-fixed For 2000 bytes values peak memory usage is ~53% compared to vanilla. The peak happens at 57% BGSAVE progress. For 500 bytes values the peak is ~80% compared to vanilla. And happens at ~80% progress. For 120 bytes the difference is under 5%, and the patched version could even use more memory. ![500b-fixed](https://github.com/user-attachments/assets/b09451d3-4bce-4f33-b3db-2b5df2178ed2) For 12M keys, the peak is ~85% of the vanilla’s. Happens at ~70% mark. For 6M keys, the peak is ~87% of the vanilla’s. Happens at ~77% mark. For 3M keys, the peak is ~87% of the vanilla’s. Happens at ~80% mark. **Changes** The PR contains 2 changes: 1. Static buffer for RDB compression. RDB compression leads to COW events even without any write load if we use `zfree`. It happens because the compression function allocates a new buffer for each object. Together with freeing objects with `zfree` it leads to reusing of the memory shared with the main process. To deal with this problem, we use a pre-allocated constant 8K buffer for compression. If the object size is too big for this buffer, then we fall back to the ad hoc allocation behavior. 2. Freeing string objects instead of dismissing them Call to `zfree` is more expensive than direct call to `madvise`. But with #453 strings use the fast path – `zfree_with_size`. As a possible next step we can optimize `zfree` for other data types as well. 
--------- Signed-off-by: Vadym Khoptynets Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: Viktor Söderqvist --- src/object.c | 9 +++++++-- src/rdb.c | 19 ++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/object.c b/src/object.c index 8c1cf64892..035198ad89 100644 --- a/src/object.c +++ b/src/object.c @@ -398,9 +398,14 @@ void decrRefCount(robj *o) { } } -/* See dismissObject() */ +/* See dismissObject(). sds is an exception, because the allocation + * size is known. Instead of dismissing it with madvise(MADV_DONTNEED) + * we free it via the allocator, which has minimal overhead when the + * size is known. This has advantage that it allows the allocator to + * accumulate free buffers to free whole pages, while madvise is nop + * if the buffer is less than a page. */ void dismissSds(sds s) { - dismissMemory(sdsAllocPtr(s), sdsAllocSize(s)); + sdsfree(s); } /* See dismissObject() */ diff --git a/src/rdb.c b/src/rdb.c index 1c200e54f5..ca904f7f98 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -49,6 +49,9 @@ #include #include +/* Size of the static buffer used for rdbcompression */ +#define LZF_STATIC_BUFFER_SIZE (8 * 1024) + /* This macro is called when the internal RDB structure is corrupt */ #define rdbReportCorruptRDB(...) 
rdbReportError(1, __LINE__, __VA_ARGS__) /* This macro is called when RDB read failed (possibly a short read) */ @@ -388,18 +391,20 @@ ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, size_t origina ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { size_t comprlen, outlen; void *out; + static void *buffer = NULL; /* We require at least four bytes compression for this to be worth it */ if (len <= 4) return 0; outlen = len - 4; - if ((out = zmalloc(outlen + 1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { - zfree(out); - return 0; + if (outlen < LZF_STATIC_BUFFER_SIZE) { + if (!buffer) buffer = zmalloc(LZF_STATIC_BUFFER_SIZE); + out = buffer; + } else { + if ((out = zmalloc(outlen + 1)) == NULL) return 0; } - ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); - zfree(out); + comprlen = lzf_compress(s, len, out, outlen); + ssize_t nwritten = comprlen ? rdbSaveLzfBlob(rdb, out, comprlen, len) : 0; + if (out != buffer) zfree(out); return nwritten; } From fbbfe5d3d3833c74d86c324ca9ffee8b97856724 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 2 Dec 2024 15:55:24 +0800 Subject: [PATCH 74/92] Print logs when the cluster state changes to fail or the fail reason changes (#1188) This log allows us to easily distinguish between full coverage and minority partition when the cluster fails. Sometimes it is not easy to see the minority partition in a healthy shard (both primary and replicas). And we decided not to add a cluster_fail_reason field to cluster info. Given that there are only two reasons and both are well-known and if we ended up adding more down the road we can add it in the future. 
Signed-off-by: Binbin --- src/cluster.h | 6 ++++++ src/cluster_legacy.c | 39 +++++++++++++++++++++++++++++++++++-- src/cluster_legacy.h | 1 + tests/unit/cluster/info.tcl | 23 ++++++++++++++++++++++ 4 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 65eadf4c65..142f2d70b3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -12,6 +12,12 @@ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ +/* Reason why the cluster state changes to fail. When adding new reasons, + * make sure to update clusterLogFailReason. */ +#define CLUSTER_FAIL_NONE 0 +#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1 +#define CLUSTER_FAIL_MINORITY_PARTITION 2 + /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index e4b25e265d..6ea8eb2e67 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1082,6 +1082,7 @@ void clusterInit(void) { server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; + server.cluster->fail_reason = CLUSTER_FAIL_NONE; server.cluster->size = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); @@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) { case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break; case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break; case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break; - default: msg = "Unknown reason code."; break; + default: serverPanic("Unknown cant failover reason code."); } lastlog_time = time(NULL); serverLog(LL_NOTICE, "Currently unable to failover: %s", msg); @@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) { 
* Cluster state evaluation function * -------------------------------------------------------------------------- */ +void clusterLogFailReason(int reason) { + if (reason == CLUSTER_FAIL_NONE) return; + + char *msg; + switch (reason) { + case CLUSTER_FAIL_NOT_FULL_COVERAGE: + msg = "At least one hash slot is not served by any available node. " + "Please check the 'cluster-require-full-coverage' configuration."; + break; + case CLUSTER_FAIL_MINORITY_PARTITION: + msg = "I am part of a minority partition."; + break; + default: serverPanic("Unknown fail reason code."); + } + serverLog(LL_WARNING, "Cluster is currently down: %s", msg); +} + /* The following are defines that are only used in the evaluation function * and are based on heuristics. Actually the main point about the rejoin and * writable delay is that they should be a few orders of magnitude larger @@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) { #define CLUSTER_WRITABLE_DELAY 2000 void clusterUpdateState(void) { - int j, new_state; + int j, new_state, new_reason; int reachable_primaries = 0; static mstime_t among_minority_time; static mstime_t first_call_time = 0; @@ -5392,12 +5410,14 @@ void clusterUpdateState(void) { /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ new_state = CLUSTER_OK; + new_reason = CLUSTER_FAIL_NONE; /* Check if all the slots are covered. */ if (server.cluster_require_full_coverage) { for (j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE; break; } } @@ -5432,6 +5452,7 @@ void clusterUpdateState(void) { if (reachable_primaries < needed_quorum) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_MINORITY_PARTITION; among_minority_time = mstime(); } } @@ -5455,7 +5476,21 @@ void clusterUpdateState(void) { serverLog(new_state == CLUSTER_OK ? 
LL_NOTICE : LL_WARNING, "Cluster state changed: %s", new_state == CLUSTER_OK ? "ok" : "fail"); server.cluster->state = new_state; + + /* Cluster state changes from ok to fail, print a log. */ + if (new_state == CLUSTER_FAIL) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } } + + /* Cluster state is still fail, but the reason has changed, print a log. */ + if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } + + if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE; } /* This function is called after the node startup in order to verify that data diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 39148c748d..5595402a4d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -370,6 +370,7 @@ struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int fail_reason; /* Why the cluster state changes to fail. */ int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl index 0d7b249899..f882378172 100644 --- a/tests/unit/cluster/info.tcl +++ b/tests/unit/cluster/info.tcl @@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" { } } ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} { + test "fail reason changed" { + # Kill one primary, so the cluster fail with not-full-coverage. 
+ pause_process [srv 0 pid] + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + verify_log_message -1 "*At least one hash slot is not served by any available node*" 0 + verify_log_message -2 "*At least one hash slot is not served by any available node*" 0 + + # Kill one more primary, so the cluster fail with minority-partition. + pause_process [srv -1 pid] + wait_for_log_messages -2 {"*minority partition*"} 0 1000 50 + + resume_process [srv 0 pid] + resume_process [srv -1 pid] + wait_for_cluster_state ok + } +} From 3df609ef06f71c37a45049ec1df9611b9f763d55 Mon Sep 17 00:00:00 2001 From: Nugine Date: Tue, 3 Dec 2024 02:40:38 +0800 Subject: [PATCH 75/92] Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#1293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between raw representation and dense representation. They are 15 ~ 30 times faster than scalar implementaion. Note that sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is default (HLL_REGISTERS == 16384 && HLL_BITS == 6). `PFDEBUG SIMD (ON|OFF)` subcommand is added for unit tests. A new TCL unit test checks that the results produced by non-AVX2 and AVX2 implementations are exactly equal. When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg. 
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5665.56 35.29839 32.25500 63.99900 67.58300 608.60 PFCOUNT-avx2 72377.83 2.75834 2.67100 5.34300 6.81500 7774.96 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 9851.29 20.28806 20.09500 36.86300 39.16700 615.71 PFMERGE-avx2 125621.89 1.59126 1.55100 3.11900 4.70300 15702.74 ------------------------------------------------------------------------------------------------------ scalar: valkey:unstable 2df56d87c0ebe802f38e8922bb2ea1e4ca9cfa76 avx2: Nugine:hll-simd 8f9adc34021080d96e60bd0abe06b043f3ed0275 CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp --------- Signed-off-by: Xuyang Wang --- src/config.h | 13 ++ src/hyperloglog.c | 303 +++++++++++++++++++++++++++++++++++-- tests/unit/hyperloglog.tcl | 40 +++++ 3 files changed, 345 insertions(+), 11 deletions(-) diff --git a/src/config.h b/src/config.h index 3b79c5c681..a2e9f353dc 100644 --- a/src/config.h +++ b/src/config.h @@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist); #define valkey_prefetch(addr) ((void)(addr)) #endif +/* Check if we can compile AVX2 code */ +#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined(HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 563c5e7941..9a48c821ab 100644 --- 
a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -35,6 +35,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -208,6 +212,13 @@ struct hllhdr { static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected"; +#ifdef HAVE_AVX2 +static int simd_enabled = 1; +#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2")) +#else +#define HLL_USE_AVX2 0 +#endif + /* =========================== Low level bit macros ========================= */ /* Macros to access the dense representation. @@ -1064,6 +1075,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + /* Shuffle indices for unpacking bytes of dense registers + * From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * To: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + 
* LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. + * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] 
= val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. */ +void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1077,12 +1218,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val, hdr->registers, i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1114,6 +1250,121 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. 
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + /* Shuffle indices for packing bytes of dense registers + * From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * To: {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. + * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. 
+ */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. */ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1363,12 +1614,17 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. 
*/ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; + if (use_dense) { hdr = o->ptr; - switch (hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; - case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; + case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of @@ -1494,6 +1750,7 @@ void pfselftestCommand(client *c) { * PFDEBUG DECODE * PFDEBUG ENCODING * PFDEBUG TODENSE + * PFDEBUG SIMD (ON|OFF) */ void pfdebugCommand(client *c) { char *cmd = c->argv[1]->ptr; @@ -1501,6 +1758,30 @@ void pfdebugCommand(client *c) { robj *o; int j; + if (!strcasecmp(cmd, "simd")) { + if (c->argc != 3) goto arityerr; + + if (!strcasecmp(c->argv[2]->ptr, "on")) { +#ifdef HAVE_AVX2 + simd_enabled = 1; +#endif + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { +#ifdef HAVE_AVX2 + simd_enabled = 0; +#endif + } else { + addReplyError(c, "Argument must be ON or OFF"); + } + + if (HLL_USE_AVX2) { + addReplyStatus(c, "enabled"); + } else { + addReplyStatus(c, "disabled"); + } + + return; + } + o = lookupKeyWrite(c->db, c->argv[2]); if (o == NULL) { addReplyError(c, "The specified key does not exist"); diff --git a/tests/unit/hyperloglog.tcl b/tests/unit/hyperloglog.tcl index c1b3b3a79f..765d5e0bdd 100644 --- a/tests/unit/hyperloglog.tcl +++ b/tests/unit/hyperloglog.tcl @@ -222,6 +222,46 @@ start_server {tags {"hll"}} { assert_equal 3 [r pfcount destkey] } + test {PFMERGE results with simd} { + r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t} + for {set x 1} {$x < 2000} {incr x} { + r pfadd hll1{t} [expr rand()] + } + for {set x 1} {$x < 4000} {incr x} { + r pfadd hll2{t} [expr rand()] + } + for {set x 1} {$x < 8000} {incr x} { + r pfadd hll3{t} 
[expr rand()] + } + assert {[r pfcount hll1{t}] > 0} + assert {[r pfcount hll2{t}] > 0} + assert {[r pfcount hll3{t}] > 0} + + r pfdebug simd off + set scalar [r pfcount hll1{t} hll2{t} hll3{t}] + r pfdebug simd on + set simd [r pfcount hll1{t} hll2{t} hll3{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + r pfdebug simd off + r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t} + r pfdebug simd on + r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t} + + set scalar [r pfcount hllscalar{t}] + set simd [r pfcount hllsimd{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + set scalar [r get hllscalar{t}] + set simd [r get hllsimd{t}] + assert_equal $scalar $simd + + } {} {needs:pfdebug} + test {PFCOUNT multiple-keys merge returns cardinality of union #1} { r del hll1{t} hll2{t} hll3{t} for {set x 1} {$x < 10000} {incr x} { From 397201c48f4cb7fd052fd98c66385eaab1981e1c Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 3 Dec 2024 08:42:29 -0800 Subject: [PATCH 76/92] Refactor of ActiveDefrag to reduce latencies (#1242) Refer to: https://github.com/valkey-io/valkey/issues/1141 This update refactors the defrag code to: * Make the overall code more readable and maintainable * Reduce latencies incurred during defrag processing With this update, the defrag cycle time is reduced to 500us, with more frequent cycles. This results in much more predictable latencies, with a dramatic reduction in tail latencies. (See https://github.com/valkey-io/valkey/issues/1141 for more complete details.) This update is focused mostly on the high-level processing, and does NOT address lower level functions which aren't currently timebound (e.g. `activeDefragSdsDict()`, and `moduleDefragGlobals()`). These are out of scope for this update and left for a future update. I fixed `kvstoreDictLUTDefrag` because it was using up to 7ms on a CME single shard. See original github issue for performance details. 
--------- Signed-off-by: Jim Brunner Signed-off-by: Madelyn Olson Co-authored-by: Madelyn Olson --- src/ae.c | 2 +- src/config.c | 5 +- src/defrag.c | 1078 +++++++++++++++++++------------ src/dict.c | 4 +- src/dict.h | 2 +- src/kvstore.c | 23 +- src/kvstore.h | 4 +- src/server.c | 29 +- src/server.h | 11 +- tests/unit/memefficiency.tcl | 25 +- tests/unit/moduleapi/defrag.tcl | 1 - valkey.conf | 18 +- 12 files changed, 731 insertions(+), 471 deletions(-) diff --git a/src/ae.c b/src/ae.c index 9bf8619902..643ff17070 100644 --- a/src/ae.c +++ b/src/ae.c @@ -85,7 +85,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; - eventLoop->timeEventNextId = 0; + eventLoop->timeEventNextId = 1; eventLoop->stop = 0; eventLoop->maxfd = -1; eventLoop->beforesleep = NULL; diff --git a/src/config.c b/src/config.c index 7f0901c50a..5a07c2c0f0 100644 --- a/src/config.c +++ b/src/config.c @@ -3278,10 +3278,11 @@ standardConfig static_configs[] = { createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), createIntConfig("tcp-keepalive", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tcpkeepalive, 300, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-migration-barrier", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_migration_barrier, 1, INTEGER_CONFIG, NULL, NULL), - createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ - createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ + createIntConfig("active-defrag-cycle-min", NULL, 
MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ + createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ createIntConfig("active-defrag-threshold-lower", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_lower, 10, INTEGER_CONFIG, NULL, NULL), /* Default: don't defrag when fragmentation is below 10% */ createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: maximum defrag force at 100% fragmentation */ + createIntConfig("active-defrag-cycle-us", NULL, MODIFIABLE_CONFIG, 0, 100000, server.active_defrag_cycle_us, 500, INTEGER_CONFIG, NULL, updateDefragConfiguration), createIntConfig("lfu-log-factor", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_log_factor, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("lfu-decay-time", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_decay_time, 1, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-priority", "slave-priority", MODIFIABLE_CONFIG, 0, INT_MAX, server.replica_priority, 100, INTEGER_CONFIG, NULL, NULL), diff --git a/src/defrag.c b/src/defrag.c index b49a175f7c..d0c7632f17 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -38,23 +38,126 @@ #ifdef HAVE_DEFRAG -typedef struct defragCtx { - void *privdata; +typedef enum { DEFRAG_NOT_DONE = 0, + DEFRAG_DONE = 1 } doneStatus; + + +/* + * Defragmentation is performed in stages. Each stage is serviced by a stage function + * (defragStageFn). The stage function is passed a target (void*) to defrag. The contents of that + * target are unique to the particular stage - and may even be NULL for some stage functions. 
The + * same stage function can be used multiple times (for different stages) each having a different + * target. + * + * The stage function is required to maintain an internal static state. This allows the stage + * function to continue when invoked in an iterative manner. When invoked with a 0 endtime, the + * stage function is required to clear it's internal state and prepare to begin a new stage. It + * should return false (more work to do) as it should NOT perform any real "work" during init. + * + * Parameters: + * endtime - This is the monotonic time that the function should end and return. This ensures + * a bounded latency due to defrag. When endtime is 0, the internal state should be + * cleared, preparing to begin the stage with a new target. + * target - This is the "thing" that should be defragged. It's type is dependent on the + * type of the stage function. This might be a dict, a kvstore, a DB, or other. + * privdata - A pointer to arbitrary private data which is unique to the stage function. + * + * Returns: + * - DEFRAG_DONE if the stage is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*defragStageFn)(monotime endtime, void *target, void *privdata); + +typedef struct { + defragStageFn stage_fn; // The function to be invoked for the stage + void *target; // The target that the function will defrag + void *privdata; // Private data, unique to the stage function +} StageDescriptor; + +/* Globals needed for the main defrag processing logic. + * Doesn't include variables specific to a stage or type of data. 
*/ +struct DefragContext { + monotime start_cycle; // Time of beginning of defrag cycle + long long start_defrag_hits; // server.stat_active_defrag_hits captured at beginning of cycle + list *remaining_stages; // List of stages which remain to be processed + StageDescriptor *current_stage; // The stage that's currently being processed + + long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) + monotime timeproc_end_time; // Ending time of previous timerproc execution + long timeproc_overage_us; // A correction value if over/under target CPU percent +}; +static struct DefragContext defrag; + + +/* There are a number of stages which process a kvstore. To simplify this, a stage helper function + * `defragStageKvstoreHelper()` is defined. This function aids in iterating over the kvstore. It + * uses these definitions. + */ +/* State of the kvstore helper. The private data (privdata) passed to the kvstore helper MUST BEGIN + * with a kvstoreIterState (or be passed as NULL). */ +#define KVS_SLOT_DEFRAG_LUT -2 +#define KVS_SLOT_UNASSIGNED -1 +typedef struct { + kvstore *kvs; int slot; - void *aux; -} defragCtx; + unsigned long cursor; +} kvstoreIterState; +/* The kvstore helper uses this function to perform tasks before continuing the iteration. For the + * main dictionary, large items are set aside and processed by this function before continuing with + * iteration over the kvstore. + * endtime - This is the monotonic time that the function should end and return. + * privdata - Private data for functions invoked by the helper. If provided in the call to + * `defragStageKvstoreHelper()`, the `kvstoreIterState` portion (at the beginning) + * will be updated with the current kvstore iteration status. 
+ * + * Returns: + * - DEFRAG_DONE if the pre-continue work is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdata); + + +// Private data for main dictionary keys +typedef struct { + kvstoreIterState kvstate; + serverDb *db; + dictEntry *saved_expire_de; +} defragKeysCtx; +static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); + +// Private data for pubsub kvstores +typedef dict *(*getClientChannelsFn)(client *); +typedef struct { + getClientChannelsFn fn; +} getClientChannelsFnWrapper; -typedef struct defragPubSubCtx { - kvstore *pubsub_channels; - dict *(*clientPubSubChannels)(client *); +typedef struct { + kvstoreIterState kvstate; + getClientChannelsFn getPubSubChannels; } defragPubSubCtx; +static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); -/* Defrag helper for generic allocations. - * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released - * and should NOT be accessed. */ -void *activeDefragAlloc(void *ptr) { + +/* When scanning a main kvstore, large elements are queued for later handling rather than + * causing a large latency spike while processing a hash table bucket. This list is only used + * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being + * defragged. + * Note that this is a list of key names. It's possible that the key may be deleted or modified + * before "later" and we will search by key name to find the entry when we defrag the item later. 
+ */ +static list *defrag_later; +static unsigned long defrag_later_cursor; + + +/* this method was added to jemalloc in order to help us understand which + * pointers are worthwhile moving and which aren't */ +int je_get_defrag_hint(void *ptr); + +/* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. + * It is the responsibility of the caller to free the old block if a non-NULL value (new block) + * is returned. (Returns NULL if no relocation was needed.) + */ +static void *activeDefragAllocWithoutFree(void *ptr, size_t *allocation_size) { size_t size; void *newptr; if (!allocatorShouldDefrag(ptr)) { @@ -67,28 +170,43 @@ void *activeDefragAlloc(void *ptr) { size = zmalloc_size(ptr); newptr = allocatorDefragAlloc(size); memcpy(newptr, ptr, size); - allocatorDefragFree(ptr, size); + if (allocation_size) *allocation_size = size; + server.stat_active_defrag_hits++; return newptr; } +/* Defrag helper for generic allocations. + * + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released + * and should NOT be accessed. */ +void *activeDefragAlloc(void *ptr) { + size_t allocation_size; + void *newptr = activeDefragAllocWithoutFree(ptr, &allocation_size); + if (newptr) allocatorDefragFree(ptr, allocation_size); + return newptr; +} + /* This method captures the expiry db dict entry which refers to data stored in keys db dict entry. 
*/ -void defragEntryStartCbForKeys(void *ctx, void *oldptr) { - defragCtx *defragctx = (defragCtx *)ctx; - serverDb *db = defragctx->privdata; +static void defragEntryStartCbForKeys(void *ctx, void *oldptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + serverDb *db = defragctx->db; sds oldsds = (sds)dictGetKey((dictEntry *)oldptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; if (kvstoreDictSize(db->expires, slot)) { dictEntry *expire_de = kvstoreDictFind(db->expires, slot, oldsds); - defragctx->aux = expire_de; + defragctx->saved_expire_de = expire_de; + } else { + defragctx->saved_expire_de = NULL; } } /* This method updates the key of expiry db dict entry. The key might be no longer valid * as it could have been cleaned up during the defrag-realloc of the main dictionary. */ -void defragEntryFinishCbForKeys(void *ctx, void *newptr) { - defragCtx *defragctx = (defragCtx *)ctx; - dictEntry *expire_de = (dictEntry *)defragctx->aux; +static void defragEntryFinishCbForKeys(void *ctx, void *newptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + dictEntry *expire_de = defragctx->saved_expire_de; /* Item doesn't have TTL associated to it. */ if (!expire_de) return; /* No reallocation happened. */ @@ -96,18 +214,18 @@ void defragEntryFinishCbForKeys(void *ctx, void *newptr) { expire_de = NULL; return; } - serverDb *db = defragctx->privdata; + serverDb *db = defragctx->db; sds newsds = (sds)dictGetKey((dictEntry *)newptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; kvstoreDictSetKey(db->expires, slot, expire_de, newsds); } -/*Defrag helper for sds strings +/* Defrag helper for sds strings * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. 
*/ -sds activeDefragSds(sds sdsptr) { +static sds activeDefragSds(sds sdsptr) { void *ptr = sdsAllocPtr(sdsptr); void *newptr = activeDefragAlloc(ptr); if (newptr) { @@ -118,60 +236,48 @@ sds activeDefragSds(sds sdsptr) { return NULL; } -/* Defrag helper for robj and/or string objects with expected refcount. - * - * Like activeDefragStringOb, but it requires the caller to pass in the expected - * reference count. In some cases, the caller needs to update a robj whose - * reference count is not 1, in these cases, the caller must explicitly pass - * in the reference count, otherwise defragmentation will not be performed. - * Note that the caller is responsible for updating any other references to the robj. */ -robj *activeDefragStringObEx(robj *ob, int expected_refcount) { - robj *ret = NULL; - if (ob->refcount != expected_refcount) return NULL; - - /* try to defrag robj (only if not an EMBSTR type (handled below). */ - if (ob->type != OBJ_STRING || ob->encoding != OBJ_ENCODING_EMBSTR) { - if ((ret = activeDefragAlloc(ob))) { - ob = ret; - } +/* Performs defrag on a string-type (or generic) robj, but does not free the old robj. This is the + * caller's responsibility. This is necessary for string objects with multiple references. In this + * case the caller can fix the references before freeing the original object. 
+ */ +static robj *activeDefragStringObWithoutFree(robj *ob, size_t *allocation_size) { + if (ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_RAW) { + // Try to defrag the linked sds, regardless of if robj will be moved + sds newsds = activeDefragSds((sds)ob->ptr); + if (newsds) ob->ptr = newsds; } - /* try to defrag string object */ - if (ob->type == OBJ_STRING) { - if (ob->encoding == OBJ_ENCODING_RAW) { - sds newsds = activeDefragSds((sds)ob->ptr); - if (newsds) { - ob->ptr = newsds; - } - } else if (ob->encoding == OBJ_ENCODING_EMBSTR) { - /* The sds is embedded in the object allocation, calculate the - * offset and update the pointer in the new allocation. */ - long ofs = (intptr_t)ob->ptr - (intptr_t)ob; - if ((ret = activeDefragAlloc(ob))) { - ret->ptr = (void *)((intptr_t)ret + ofs); - } - } else if (ob->encoding != OBJ_ENCODING_INT) { - serverPanic("Unknown string encoding"); - } + robj *new_robj = activeDefragAllocWithoutFree(ob, allocation_size); + + if (new_robj && ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_EMBSTR) { + // If the robj is moved, correct the internal pointer + long embstr_offset = (intptr_t)ob->ptr - (intptr_t)ob; + new_robj->ptr = (void *)((intptr_t)new_robj + embstr_offset); } - return ret; + return new_robj; } + /* Defrag helper for robj and/or string objects * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. 
*/ robj *activeDefragStringOb(robj *ob) { - return activeDefragStringObEx(ob, 1); + size_t allocation_size; + if (ob->refcount != 1) return NULL; // Unsafe to defrag if multiple refs + robj *new_robj = activeDefragStringObWithoutFree(ob, &allocation_size); + if (new_robj) allocatorDefragFree(ob, allocation_size); + return new_robj; } + /* Defrag helper for lua scripts * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -luaScript *activeDefragLuaScript(luaScript *script) { +static luaScript *activeDefragLuaScript(luaScript *script) { luaScript *ret = NULL; /* try to defrag script struct */ @@ -193,7 +299,7 @@ luaScript *activeDefragLuaScript(luaScript *script) { * Returns NULL in case the allocation wasn't moved. * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -dict *dictDefragTables(dict *d) { +static dict *dictDefragTables(dict *d) { dict *ret = NULL; dictEntry **newtable; /* handle the dict struct */ @@ -211,7 +317,7 @@ dict *dictDefragTables(dict *d) { } /* Internal function used by zslDefrag */ -void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { +static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == oldnode) update[i]->level[i].forward = newnode; @@ -233,7 +339,7 @@ void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnod * only need to defrag the skiplist, but not update the obj pointer. * When return value is non-NULL, it is the score reference that must be updated * in the dict record. 
*/ -double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { +static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; int i; sds ele = newele ? newele : oldele; @@ -267,7 +373,7 @@ double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { /* Defrag helper for sorted set. * Defrag a single dict entry key name, and corresponding skiplist struct */ -void activeDefragZsetEntry(zset *zs, dictEntry *de) { +static void activeDefragZsetEntry(zset *zs, dictEntry *de) { sds newsds; double *newscore; sds sdsele = dictGetKey(de); @@ -284,13 +390,13 @@ void activeDefragZsetEntry(zset *zs, dictEntry *de) { #define DEFRAG_SDS_DICT_VAL_VOID_PTR 3 #define DEFRAG_SDS_DICT_VAL_LUA_SCRIPT 4 -void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { +static void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); } /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ -void activeDefragSdsDict(dict *d, int val_type) { +static void activeDefragSdsDict(dict *d, int val_type) { unsigned long cursor = 0; dictDefragFunctions defragfns = { .defragAlloc = activeDefragAlloc, @@ -306,34 +412,7 @@ void activeDefragSdsDict(dict *d, int val_type) { } /* Defrag a list of ptr, sds or robj string values */ -void activeDefragList(list *l, int val_type) { - listNode *ln, *newln; - for (ln = l->head; ln; ln = ln->next) { - if ((newln = activeDefragAlloc(ln))) { - if (newln->prev) - newln->prev->next = newln; - else - l->head = newln; - if (newln->next) - newln->next->prev = newln; - else - l->tail = newln; - ln = newln; - } - if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) { - sds newsds, sdsele = ln->value; - if ((newsds = activeDefragSds(sdsele))) ln->value = newsds; - } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) { - robj *newele, *ele = ln->value; - if ((newele = activeDefragStringOb(ele))) 
ln->value = newele; - } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) { - void *newptr, *ptr = ln->value; - if ((newptr = activeDefragAlloc(ptr))) ln->value = newptr; - } - } -} - -void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { +static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; unsigned char *newzl; if ((newnode = activeDefragAlloc(node))) { @@ -350,7 +429,7 @@ void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { if ((newzl = activeDefragAlloc(node->entry))) node->entry = newzl; } -void activeDefragQuickListNodes(quicklist *ql) { +static void activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head; while (node) { activeDefragQuickListNode(ql, &node); @@ -361,13 +440,18 @@ void activeDefragQuickListNodes(quicklist *ql) { /* when the value has lots of elements, we want to handle it later and not as * part of the main dictionary scan. this is needed in order to prevent latency * spikes when handling large items */ -void defragLater(serverDb *db, dictEntry *kde) { +static void defragLater(dictEntry *kde) { + if (!defrag_later) { + defrag_later = listCreate(); + listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + defrag_later_cursor = 0; + } sds key = sdsdup(dictGetKey(kde)); - listAddNodeTail(db->defrag_later, key); + listAddNodeTail(defrag_later, key); } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { +static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { quicklist *ql = ob->ptr; quicklistNode *node; long iterations = 0; @@ -392,7 +476,7 @@ long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { activeDefragQuickListNode(ql, &node); server.stat_active_defrag_scanned++; if (++iterations > 128 && !bookmark_failed) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { if (!quicklistBookmarkCreate(&ql, "_AD", node)) { bookmark_failed = 1; } else { @@ -413,14 +497,14 @@ typedef struct { zset *zs; } scanLaterZsetData; -void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { +static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { dictEntry *de = (dictEntry *)_de; scanLaterZsetData *data = privdata; activeDefragZsetEntry(data->zs, de); server.stat_active_defrag_scanned++; } -void scanLaterZset(robj *ob, unsigned long *cursor) { +static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; dict *d = zs->dict; @@ -430,13 +514,13 @@ void scanLaterZset(robj *ob, unsigned long *cursor) { } /* Used as scan callback when all the work is done in the dictDefragFunctions. 
*/ -void scanCallbackCountScanned(void *privdata, const dictEntry *de) { +static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); server.stat_active_defrag_scanned++; } -void scanLaterSet(robj *ob, unsigned long *cursor) { +static void scanLaterSet(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -444,7 +528,7 @@ void scanLaterSet(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void scanLaterHash(robj *ob, unsigned long *cursor) { +static void scanLaterHash(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -453,18 +537,18 @@ void scanLaterHash(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void defragQuicklist(serverDb *db, dictEntry *kde) { +static void defragQuicklist(dictEntry *kde) { robj *ob = dictGetVal(kde); quicklist *ql = ob->ptr, *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql; if (ql->len > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragQuickListNodes(ql); } -void defragZsetSkiplist(serverDb *db, dictEntry *kde) { +static void defragZsetSkiplist(dictEntry *kde) { robj *ob = dictGetVal(kde); zset *zs = (zset *)ob->ptr; zset *newzs; @@ -477,7 +561,7 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + 
defragLater(kde); else { dictIterator *di = dictGetIterator(zs->dict); while ((de = dictNext(di)) != NULL) { @@ -489,26 +573,26 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } -void defragHash(serverDb *db, dictEntry *kde) { +static void defragHash(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* defrag the dict struct and tables */ if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; } -void defragSet(serverDb *db, dictEntry *kde) { +static void defragSet(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); /* defrag the dict struct and tables */ @@ -517,7 +601,7 @@ void defragSet(serverDb *db, dictEntry *kde) { /* Defrag callback for radix tree iterator, called for each node, * used in order to defrag the nodes allocations. */ -int defragRaxNode(raxNode **noderef) { +static int defragRaxNode(raxNode **noderef) { raxNode *newnode = activeDefragAlloc(*noderef); if (newnode) { *noderef = newnode; @@ -527,7 +611,7 @@ int defragRaxNode(raxNode **noderef) { } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) { +static int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, monotime endtime) { static unsigned char last[sizeof(streamID)]; raxIterator ri; long iterations = 0; @@ -563,7 +647,7 @@ int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) if (newdata) raxSetData(ri.node, ri.data = newdata); server.stat_active_defrag_scanned++; if (++iterations > 128) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { serverAssert(ri.key_len == sizeof(last)); memcpy(last, ri.key, ri.key_len); raxStop(&ri); @@ -585,7 +669,7 @@ typedef void *(raxDefragFunction)(raxIterator *ri, void *privdata); * 2) rax nodes * 3) rax entry data (only if defrag_data is specified) * 4) call a callback per element, and allow the callback to return a new pointer for the element */ -void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { +static void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { raxIterator ri; rax *rax; if ((rax = activeDefragAlloc(*raxref))) *raxref = rax; @@ -608,7 +692,7 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; streamNACK *nack = ri->data, *newnack; nack->consumer = ctx->c; /* update nack pointer to consumer */ @@ -622,7 +706,7 @@ void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { return newnack; } -void *defragStreamConsumer(raxIterator *ri, void *privdata) { +static void *defragStreamConsumer(raxIterator *ri, void *privdata) { streamConsumer *c = ri->data; streamCG *cg = privdata; void *newc = activeDefragAlloc(c); @@ -638,7 +722,7 @@ void *defragStreamConsumer(raxIterator *ri, void *privdata) { return newc; 
/* returns NULL if c was not defragged */ } -void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { streamCG *cg = ri->data; UNUSED(privdata); if (cg->consumers) defragRadixTree(&cg->consumers, 0, defragStreamConsumer, cg); @@ -646,7 +730,7 @@ void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { return NULL; } -void defragStream(serverDb *db, dictEntry *kde) { +static void defragStream(dictEntry *kde) { robj *ob = dictGetVal(kde); serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); stream *s = ob->ptr, *news; @@ -657,7 +741,7 @@ void defragStream(serverDb *db, dictEntry *kde) { if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { rax *newrax = activeDefragAlloc(s->rax); if (newrax) s->rax = newrax; - defragLater(db, kde); + defragLater(kde); } else defragRadixTree(&s->rax, 1, NULL, NULL); @@ -667,25 +751,25 @@ void defragStream(serverDb *db, dictEntry *kde) { /* Defrag a module key. This is either done immediately or scheduled * for later. Returns then number of pointers defragged. */ -void defragModule(serverDb *db, dictEntry *kde) { +static void defragModule(serverDb *db, dictEntry *kde) { robj *obj = dictGetVal(kde); serverAssert(obj->type == OBJ_MODULE); - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(db, kde); + if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(kde); } /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ -void defragKey(defragCtx *ctx, dictEntry *de) { - serverDb *db = ctx->privdata; - int slot = ctx->slot; +static void defragKey(defragKeysCtx *ctx, dictEntry *de) { + serverDb *db = ctx->db; + int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; /* Try to defrag robj and / or string value. 
*/ ob = dictGetVal(de); if ((newob = activeDefragStringOb(ob))) { - kvstoreDictSetVal(db->keys, slot, de, newob); + kvstoreDictSetVal(ctx->kvstate.kvs, slot, de, newob); ob = newob; } @@ -693,7 +777,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { /* Already handled in activeDefragStringOb. */ } else if (ob->type == OBJ_LIST) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { - defragQuicklist(db, de); + defragQuicklist(de); } else if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else { @@ -701,7 +785,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } } else if (ob->type == OBJ_SET) { if (ob->encoding == OBJ_ENCODING_HT) { - defragSet(db, de); + defragSet(de); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; @@ -712,7 +796,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { - defragZsetSkiplist(db, de); + defragZsetSkiplist(de); } else { serverPanic("Unknown sorted set encoding"); } @@ -720,12 +804,12 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { - defragHash(db, de); + defragHash(de); } else { serverPanic("Unknown hash encoding"); } } else if (ob->type == OBJ_STREAM) { - defragStream(db, de); + defragStream(de); } else if (ob->type == OBJ_MODULE) { defragModule(db, de); } else { @@ -734,9 +818,9 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } /* Defrag scan callback for the main db dictionary. 
*/ -void defragScanCallback(void *privdata, const dictEntry *de) { +static void dbKeysScanCallback(void *privdata, const dictEntry *de) { long long hits_before = server.stat_active_defrag_hits; - defragKey((defragCtx *)privdata, (dictEntry *)de); + defragKey((defragKeysCtx *)privdata, (dictEntry *)de); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -750,7 +834,7 @@ void defragScanCallback(void *privdata, const dictEntry *de) { * fragmentation ratio in order to decide if a defrag action should be taken * or not, a false detection can cause the defragmenter to waste a lot of CPU * without the possibility of getting any results. */ -float getAllocatorFragmentation(size_t *out_frag_bytes) { +static float getAllocatorFragmentation(size_t *out_frag_bytes) { size_t resident, active, allocated, frag_smallbins_bytes; zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); @@ -768,18 +852,18 @@ float getAllocatorFragmentation(size_t *out_frag_bytes) { } /* Defrag scan callback for the pubsub dictionary. */ -void defragPubsubScanCallback(void *privdata, const dictEntry *de) { - defragCtx *ctx = privdata; - defragPubSubCtx *pubsub_ctx = ctx->privdata; - kvstore *pubsub_channels = pubsub_ctx->pubsub_channels; +static void defragPubsubScanCallback(void *privdata, const dictEntry *de) { + defragPubSubCtx *ctx = privdata; + kvstore *pubsub_channels = ctx->kvstate.kvs; robj *newchannel, *channel = dictGetKey(de); dict *newclients, *clients = dictGetVal(de); + size_t allocation_size; /* Try to defrag the channel name. 
*/ serverAssert(channel->refcount == (int)dictSize(clients) + 1); - newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1); + newchannel = activeDefragStringObWithoutFree(channel, &allocation_size); if (newchannel) { - kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry *)de, newchannel); + kvstoreDictSetKey(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newchannel); /* The channel name is shared by the client's pubsub(shard) and server's * pubsub(shard), after defraging the channel name, we need to update @@ -788,35 +872,26 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) { dictEntry *clientde; while ((clientde = dictNext(di)) != NULL) { client *c = dictGetKey(clientde); - dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel); + dict *client_channels = ctx->getPubSubChannels(c); + dictEntry *pubsub_channel = dictFind(client_channels, newchannel); serverAssert(pubsub_channel); - dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel); + dictSetKey(ctx->getPubSubChannels(c), pubsub_channel, newchannel); } dictReleaseIterator(di); + // Now that we're done correcting the references, we can safely free the old channel robj + allocatorDefragFree(channel, allocation_size); } /* Try to defrag the dictionary of clients that is stored as the value part. */ if ((newclients = dictDefragTables(clients))) - kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry *)de, newclients); + kvstoreDictSetVal(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newclients); server.stat_active_defrag_scanned++; } -/* We may need to defrag other globals, one small allocation can hold a full allocator run. - * so although small, it is still important to defrag these */ -void defragOtherGlobals(void) { - /* there are many more pointers to defrag (e.g. client argv, output / aof buffers, etc. 
- * but we assume most of these are short lived, we only need to defrag allocations - * that remain static for a long time */ - activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); - moduleDefragGlobals(); - kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables); - kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables); -} - /* returns 0 more work may or may not be needed (see non-zero cursor), * and 1 if time is up and more work is needed. */ -int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int dbid) { +static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtime, int dbid) { if (de) { robj *ob = dictGetVal(de); if (ob->type == OBJ_LIST) { @@ -830,7 +905,8 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); + long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); + return moduleLateDefrag(dictGetKey(de), ob, cursor, endtimeWallClock, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } @@ -840,299 +916,474 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int return 0; } -/* static variables serving defragLaterStep to continue scanning a key from were we stopped last time. */ -static sds defrag_later_current_key = NULL; -static unsigned long defrag_later_cursor = 0; -/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -int defragLaterStep(serverDb *db, int slot, long long endtime) { +// A kvstoreHelperPreContinueFn +static doneStatus defragLaterStep(monotime endtime, void *privdata) { + defragKeysCtx *ctx = privdata; + unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long key_defragged; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_later_cursor) { - listNode *head = listFirst(db->defrag_later); - - /* Move on to next key */ - if (defrag_later_current_key) { - serverAssert(defrag_later_current_key == head->value); - listDelNode(db->defrag_later, head); - defrag_later_cursor = 0; - defrag_later_current_key = NULL; - } + while (defrag_later && listLength(defrag_later) > 0) { + listNode *head = listFirst(defrag_later); + sds key = head->value; + dictEntry *de = kvstoreDictFind(ctx->kvstate.kvs, ctx->kvstate.slot, key); - /* stop if we reached the last one. */ - head = listFirst(db->defrag_later); - if (!head) return 0; - - /* start a new key */ - defrag_later_current_key = head->value; - defrag_later_cursor = 0; - } - - /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */ - dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key); - key_defragged = server.stat_active_defrag_hits; - do { - int quit = 0; - if (defragLaterItem(de, &defrag_later_cursor, endtime, db->id)) - quit = 1; /* time is up, we didn't finish all the work */ - - /* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields - * (if we have a lot of pointers in one hash bucket, or rehashing), - * check if we reached the time limit. 
*/ - if (quit || (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64)) { - if (quit || ustime() > endtime) { - if (key_defragged != server.stat_active_defrag_hits) - server.stat_active_defrag_key_hits++; - else - server.stat_active_defrag_key_misses++; - return 1; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (defrag_later_cursor); - if (key_defragged != server.stat_active_defrag_hits) + long long key_defragged = server.stat_active_defrag_hits; + bool timeout = (defragLaterItem(de, &defrag_later_cursor, endtime, ctx->db->id) == 1); + if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; - else + } else { server.stat_active_defrag_key_misses++; - } while (1); -} + } -#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) -#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) + if (timeout) break; -/* decide if defrag is needed, and at what CPU effort to invest in it */ -void computeDefragCycles(void) { - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - /* If we're not already running, and below the threshold, exit. 
*/ - if (!server.active_defrag_running) { - if (frag_pct < server.active_defrag_threshold_lower || frag_bytes < server.active_defrag_ignore_bytes) return; + if (defrag_later_cursor == 0) { + // the item is finished, move on + listDelNode(defrag_later, head); + } + + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || + server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() > endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; + } } - /* Calculate the adaptive aggressiveness of the defrag based on the current - * fragmentation and configurations. */ - int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, - server.active_defrag_cycle_min, server.active_defrag_cycle_max); - cpu_pct = LIMIT(cpu_pct, server.active_defrag_cycle_min, server.active_defrag_cycle_max); + return (!defrag_later || listLength(defrag_later) == 0) ? DEFRAG_DONE : DEFRAG_NOT_DONE; +} - /* Normally we allow increasing the aggressiveness during a scan, but don't - * reduce it, since we should not lower the aggressiveness when fragmentation - * drops. But when a configuration is made, we should reconsider it. */ - if (cpu_pct > server.active_defrag_running || server.active_defrag_configuration_changed) { - server.active_defrag_running = cpu_pct; - server.active_defrag_configuration_changed = 0; - serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", frag_pct, frag_bytes, - cpu_pct); + +/* This helper function handles most of the work for iterating over a kvstore. 'privdata', if + * provided, MUST begin with 'kvstoreIterState' and this part is automatically updated by this + * function during the iteration. 
*/ +static doneStatus defragStageKvstoreHelper(monotime endtime, + kvstore *kvs, + dictScanFunction scan_fn, + kvstoreHelperPreContinueFn precontinue_fn, + const dictDefragFunctions *defragfns, + void *privdata) { + static kvstoreIterState state; // STATIC - this persists + if (endtime == 0) { + // Starting the stage, set up the state information for this stage + state.kvs = kvs; + state.slot = KVS_SLOT_DEFRAG_LUT; + state.cursor = 0; + return DEFRAG_NOT_DONE; } -} + serverAssert(kvs == state.kvs); // Shouldn't change during the stage -/* Perform incremental defragmentation work from the serverCron. - * This works in a similar way to activeExpireCycle, in the sense that - * we do incremental work across calls. */ -void activeDefragCycle(void) { - static int slot = -1; - static int current_db = -1; - static int defrag_later_item_in_progress = 0; - static int defrag_stage = 0; - static unsigned long defrag_cursor = 0; - static serverDb *db = NULL; - static long long start_scan, start_stat; unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long start, timelimit, endtime; - mstime_t latency; - int all_stages_finished = 0; - int quit = 0; - if (!server.active_defrag_enabled) { - if (server.active_defrag_running) { - /* if active defrag was disabled mid-run, start from fresh next time. 
*/ - server.active_defrag_running = 0; - server.active_defrag_configuration_changed = 0; - if (db) listEmpty(db->defrag_later); - defrag_later_current_key = NULL; - defrag_later_cursor = 0; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - goto update_metrics; + if (state.slot == KVS_SLOT_DEFRAG_LUT) { + // Before we start scanning the kvstore, handle the main structures + do { + state.cursor = kvstoreDictLUTDefrag(kvs, state.cursor, dictDefragTables); + if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; + } while (state.cursor != 0); + state.slot = KVS_SLOT_UNASSIGNED; + } + + while (true) { + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() >= endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; } - return; + + if (precontinue_fn) { + if (privdata) *(kvstoreIterState *)privdata = state; + if (precontinue_fn(endtime, privdata) == DEFRAG_NOT_DONE) return DEFRAG_NOT_DONE; + } + + if (!state.cursor) { + // If there's no cursor, we're ready to begin a new kvstore slot. + if (state.slot == KVS_SLOT_UNASSIGNED) { + state.slot = kvstoreGetFirstNonEmptyDictIndex(kvs); + } else { + state.slot = kvstoreGetNextNonEmptyDictIndex(kvs, state.slot); + } + + if (state.slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE; + } + + // Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. + if (privdata) *(kvstoreIterState *)privdata = state; + state.cursor = kvstoreDictScanDefrag(kvs, state.slot, state.cursor, + scan_fn, defragfns, privdata); } - if (hasActiveChildProcess()) return; /* Defragging memory while there's a fork will just do damage. 
*/ + return DEFRAG_NOT_DONE; +} + - /* Once a second, check if the fragmentation justfies starting a scan - * or making it more aggressive. */ - run_with_period(1000) { - computeDefragCycles(); +// Note: target is a DB, (not a KVS like most stages) +static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + serverDb *db = (serverDb *)target; + + static defragKeysCtx ctx; // STATIC - this persists + if (endtime == 0) { + ctx.db = db; + // Don't return yet. Call the helper with endtime==0 below. } + serverAssert(ctx.db == db); - /* Normally it is checked once a second, but when there is a configuration - * change, we want to check it as soon as possible. */ - if (server.active_defrag_configuration_changed) { - computeDefragCycles(); - server.active_defrag_configuration_changed = 0; + /* Note: for DB keys, we use the start/finish callback to fix an expires table entry if + * the main DB entry has been moved. */ + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by dbKeysScanCallback + .defragVal = NULL, // Handled by dbKeysScanCallback + .defragEntryStartCb = defragEntryStartCbForKeys, + .defragEntryFinishCb = defragEntryFinishCbForKeys}; + + return defragStageKvstoreHelper(endtime, db->keys, + dbKeysScanCallback, defragLaterStep, &defragfns, &ctx); +} + + +static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Not needed for expires (just a ref) + .defragVal = NULL, // Not needed for expires (no value) + }; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + scanCallbackCountScanned, NULL, &defragfns, NULL); +} + + +static doneStatus defragStagePubsubKvstore(monotime endtime, void *target, void *privdata) { + // target is server.pubsub_channels or server.pubsubshard_channels + 
getClientChannelsFnWrapper *fnWrapper = privdata; + + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by defragPubsubScanCallback + .defragVal = NULL, // Not needed for expires (no value) + }; + defragPubSubCtx ctx; + + ctx.getPubSubChannels = fnWrapper->fn; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + defragPubsubScanCallback, NULL, &defragfns, &ctx); +} + + +static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); + return DEFRAG_DONE; +} + + +static doneStatus defragModuleGlobals(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + moduleDefragGlobals(); + return DEFRAG_DONE; +} + + +static bool defragIsRunning(void) { + return (defrag.timeproc_id > 0); +} + + +static void addDefragStage(defragStageFn stage_fn, void *target, void *privdata) { + StageDescriptor *stage = zmalloc(sizeof(StageDescriptor)); + stage->stage_fn = stage_fn; + stage->target = target; + stage->privdata = privdata; + listAddNodeTail(defrag.remaining_stages, stage); +} + + +// Called at the end of a complete defrag cycle, or when defrag is terminated +static void endDefragCycle(bool normal_termination) { + if (normal_termination) { + // For normal termination, we expect... 
+ serverAssert(!defrag.current_stage); + serverAssert(listLength(defrag.remaining_stages) == 0); + serverAssert(!defrag_later || listLength(defrag_later) == 0); + } else { + // Defrag is being terminated abnormally + aeDeleteTimeEvent(server.el, defrag.timeproc_id); + + if (defrag.current_stage) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + listSetFreeMethod(defrag.remaining_stages, zfree); } + defrag.timeproc_id = AE_DELETED_EVENT_ID; - if (!server.active_defrag_running) return; + listRelease(defrag.remaining_stages); + defrag.remaining_stages = NULL; - /* See activeExpireCycle for how timelimit is handled. */ - start = ustime(); - timelimit = 1000000 * server.active_defrag_running / server.hz / 100; - if (timelimit <= 0) timelimit = 1; - endtime = start + timelimit; - latencyStartMonitor(latency); + if (defrag_later) { + listRelease(defrag_later); + defrag_later = NULL; + } + defrag_later_cursor = 0; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragEntryStartCb = defragEntryStartCbForKeys, - .defragEntryFinishCb = defragEntryFinishCbForKeys}; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_stage && !defrag_cursor && (slot < 0)) { - /* finish any leftovers from previous db before moving to the next one */ - if (db && defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", + (int)elapsedMs(defrag.start_cycle), (int)(server.stat_active_defrag_hits - defrag.start_defrag_hits), + frag_pct, frag_bytes); - /* Move on to next database, and stop if we reached the last one. 
*/ - if (++current_db >= server.dbnum) { - /* defrag other items not part of the db / keys */ - defragOtherGlobals(); - - long long now = ustime(); - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", - (int)((now - start_scan) / 1000), (int)(server.stat_active_defrag_hits - start_stat), - frag_pct, frag_bytes); - - start_scan = now; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - server.active_defrag_running = 0; - - computeDefragCycles(); /* if another scan is needed, start it right away */ - if (server.active_defrag_running != 0 && ustime() < endtime) continue; - break; - } else if (current_db == 0) { - /* Start a scan from the first database. */ - start_scan = ustime(); - start_stat = server.stat_active_defrag_hits; - } + server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); + server.stat_last_active_defrag_time = 0; + server.active_defrag_cpu_percent = 0; +} + + +/* Must be called at the start of the timeProc as it measures the delay from the end of the previous + * timeProc invocation when performing the computation. */ +static int computeDefragCycleUs(void) { + long dutyCycleUs; - db = &server.db[current_db]; - kvstoreDictLUTDefrag(db->keys, dictDefragTables); - kvstoreDictLUTDefrag(db->expires, dictDefragTables); - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + static int prevCpuPercent = 0; // STATIC - this persists + if (targetCpuPercent != prevCpuPercent) { + /* If the targetCpuPercent changes, the value might be different from when the last wait + * time was computed. In this case, don't consider wait time. 
(This is really only an + * issue in crazy tests that dramatically increase CPU while defrag is running.) */ + defrag.timeproc_end_time = 0; + prevCpuPercent = targetCpuPercent; + } + + // Given when the last duty cycle ended, compute time needed to achieve the desired percentage. + if (defrag.timeproc_end_time == 0) { + // Either the first call to the timeProc, or we were paused for some reason. + defrag.timeproc_overage_us = 0; + dutyCycleUs = server.active_defrag_cycle_us; + } else { + long waitedUs = getMonotonicUs() - defrag.timeproc_end_time; + /* Given the elapsed wait time between calls, compute the necessary duty time needed to + * achieve the desired CPU percentage. + * With: D = duty time, W = wait time, P = percent + * Solve: D P + * ----- = ----- + * D + W 100 + * Solving for D: + * D = P * W / (100 - P) + * + * Note that dutyCycleUs addresses starvation. If the wait time was long, we will compensate + * with a proportionately long duty-cycle. This won't significantly affect perceived + * latency, because clients are already being impacted by the long cycle time which caused + * the starvation of the timer. */ + dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); + + // Also adjust for any accumulated overage(underage). + dutyCycleUs -= defrag.timeproc_overage_us; + defrag.timeproc_overage_us = 0; + + if (dutyCycleUs < server.active_defrag_cycle_us) { + /* We never reduce our cycle time, that would increase overhead. Instead, we track this + * as part of the overage, and increase wait time between cycles. */ + defrag.timeproc_overage_us = server.active_defrag_cycle_us - dutyCycleUs; + dutyCycleUs = server.active_defrag_cycle_us; } + } + return dutyCycleUs; +} - /* This array of structures holds the parameters for all defragmentation stages. 
*/ - typedef struct defragStage { - kvstore *kvs; - dictScanFunction *scanfn; - void *privdata; - } defragStage; - defragStage defrag_stages[] = { - {db->keys, defragScanCallback, db}, - {db->expires, scanCallbackCountScanned, NULL}, - {server.pubsub_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}}, - {server.pubsubshard_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}}, - }; - do { - int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]); - serverAssert(defrag_stage < num_stages); - defragStage *current_stage = &defrag_stages[defrag_stage]; - - /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */ - if (defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } - if (!defrag_later_item_in_progress) { - /* Continue defragmentation from the previous stage. - * If slot is -1, it means this stage starts from the first non-empty slot. */ - if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs); - defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor, current_stage->scanfn, - &defragfns, &(defragCtx){current_stage->privdata, slot}); - } +/* Must be called at the end of the timeProc as it records the timeproc_end_time for use in the next + * computeDefragCycleUs computation. */ +static int computeDelayMs(monotime intendedEndtime) { + defrag.timeproc_end_time = getMonotonicUs(); + int overage = defrag.timeproc_end_time - intendedEndtime; + defrag.timeproc_overage_us += overage; // track over/under desired CPU + + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + // Given the desired duty cycle, what inter-cycle delay do we need to achieve that? 

+    // We want to achieve a specific CPU percent. To do that, we can't use a skewed computation.
+    // Example, if we run for 1ms and delay 10ms, that's NOT 10%, because the total cycle time is 11ms.
+    // Instead, if we run for 1ms, our total time should be 10ms. So the delay is only 9ms.
+    long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent;
+    long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us;
+    // Only increase delay by the fraction of the overage that would be non-duty-cycle
+    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative
+    if (delayUs < 0) delayUs = 0;
+    long delayMs = delayUs / 1000; // round down
+    return delayMs;
+}

-        if (!defrag_cursor) {
-            /* Move to the next slot only if regular and large item scanning has been completed. */
-            if (listLength(db->defrag_later) > 0) {
-                defrag_later_item_in_progress = 1;
-                continue;
-            }

-            /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. */
-            if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1) defrag_stage++;
-            defrag_later_item_in_progress = 0;
-        }

+/* An independent time proc for defrag. While defrag is running, this is called much more often
+ * than the server cron. Frequent short calls provide low latency impact. */
+static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);

-        /* Check if all defragmentation stages have been processed.
-         * If so, mark as finished and reset the stage counter to move on to next database. */
-        if (defrag_stage == num_stages) {
-            all_stages_finished = 1;
-            defrag_stage = 0;
-        }

+    // This timer shouldn't be registered unless there's work to do.
+    serverAssert(defrag.current_stage || listLength(defrag.remaining_stages) > 0);

-        /* Once in 16 scan iterations, 512 pointer reallocations. 
or 64 keys - * (if we have a lot of pointers in one hash bucket or rehashing), - * check if we reached the time limit. - * But regardless, don't start a new db in this loop, this is because after - * the last db we call defragOtherGlobals, which must be done in one cycle */ - if (all_stages_finished || ++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64) { - /* Quit if all stages were finished or timeout. */ - if (all_stages_finished || ustime() > endtime) { - quit = 1; - break; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (!all_stages_finished && !quit); - } while (!quit); + if (!server.active_defrag_enabled) { + // Defrag has been disabled while running + endDefragCycle(false); + return AE_NOMORE; + } + + if (hasActiveChildProcess()) { + // If there's a child process, pause the defrag, polling until the child completes. 
+ defrag.timeproc_end_time = 0; // prevent starvation recovery + return 100; + } + + monotime starttime = getMonotonicUs(); + monotime endtime = starttime + computeDefragCycleUs(); + + mstime_t latency; + latencyStartMonitor(latency); + + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } + + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); -update_metrics: - if (server.active_defrag_running > 0) { - if (server.stat_last_active_defrag_time == 0) elapsedStart(&server.stat_last_active_defrag_time); - } else if (server.stat_last_active_defrag_time != 0) { - server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); - server.stat_last_active_defrag_time = 0; + if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + return computeDelayMs(endtime); + } else { + endDefragCycle(true); + return AE_NOMORE; // Ends the timer proc + } +} + + +/* During long running scripts, or while loading, there is a periodic function for handling other + * actions. This interface allows defrag to continue running, avoiding a single long defrag step + * after the long operation completes. */ +void defragWhileBlocked(void) { + if (!defragIsRunning()) return; + + // Save off the timeproc_id. If we have a normal termination, it will be cleared. 
+ long long timeproc_id = defrag.timeproc_id; + + // Simulate a single call of the timer proc + long long reschedule_delay = activeDefragTimeProc(NULL, 0, NULL); + if (reschedule_delay == AE_NOMORE) { + // If it's done, deregister the timer + aeDeleteTimeEvent(server.el, timeproc_id); } + /* Otherwise, just ignore the reschedule_delay, the timer will pop the next time that the + * event loop can process timers again. */ +} + + +static void beginDefragCycle(void) { + serverAssert(!defragIsRunning()); + + serverAssert(defrag.remaining_stages == NULL); + defrag.remaining_stages = listCreate(); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + serverDb *db = &server.db[dbid]; + addDefragStage(defragStageDbKeys, db, NULL); + addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + } + + static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; + static getClientChannelsFnWrapper getClientPubSubShardChannelsFn = {getClientPubSubShardChannels}; + addDefragStage(defragStagePubsubKvstore, server.pubsub_channels, &getClientPubSubChannelsFn); + addDefragStage(defragStagePubsubKvstore, server.pubsubshard_channels, &getClientPubSubShardChannelsFn); + + addDefragStage(defragLuaScripts, NULL, NULL); + addDefragStage(defragModuleGlobals, NULL, NULL); + + defrag.current_stage = NULL; + defrag.start_cycle = getMonotonicUs(); + defrag.start_defrag_hits = server.stat_active_defrag_hits; + defrag.timeproc_end_time = 0; + defrag.timeproc_overage_us = 0; + defrag.timeproc_id = aeCreateTimeEvent(server.el, 0, activeDefragTimeProc, NULL, NULL); + + elapsedStart(&server.stat_last_active_defrag_time); +} + + +#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) +#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? 
max : (y))) + +/* decide if defrag is needed, and at what CPU effort to invest in it */ +static void updateDefragCpuPercent(void) { + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + if (server.active_defrag_cpu_percent == 0) { + if (frag_pct < server.active_defrag_threshold_lower || + frag_bytes < server.active_defrag_ignore_bytes) return; + } + + /* Calculate the adaptive aggressiveness of the defrag based on the current + * fragmentation and configurations. */ + int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, + server.active_defrag_cpu_min, server.active_defrag_cpu_max); + cpu_pct = LIMIT(cpu_pct, server.active_defrag_cpu_min, server.active_defrag_cpu_max); + + /* Normally we allow increasing the aggressiveness during a scan, but don't + * reduce it, since we should not lower the aggressiveness when fragmentation + * drops. But when a configuration is made, we should reconsider it. */ + if (cpu_pct > server.active_defrag_cpu_percent || server.active_defrag_configuration_changed) { + server.active_defrag_configuration_changed = 0; + if (defragIsRunning()) { + serverLog(LL_VERBOSE, "Changing active defrag CPU, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } else { + serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } + server.active_defrag_cpu_percent = cpu_pct; + } +} + + +void monitorActiveDefrag(void) { + if (!server.active_defrag_enabled) return; + + /* Defrag gets paused while a child process is active. So there's no point in starting a new + * cycle or adjusting the CPU percentage for an existing cycle. 
*/ + if (hasActiveChildProcess()) return; + + updateDefragCpuPercent(); + + if (server.active_defrag_cpu_percent > 0 && !defragIsRunning()) beginDefragCycle(); } #else /* HAVE_DEFRAG */ -void activeDefragCycle(void) { +void monitorActiveDefrag(void) { /* Not implemented yet. */ } @@ -1146,4 +1397,7 @@ robj *activeDefragStringOb(robj *ob) { return NULL; } +void defragWhileBlocked(void) { +} + #endif diff --git a/src/dict.c b/src/dict.c index 48c0f815bb..f75369d533 100644 --- a/src/dict.c +++ b/src/dict.c @@ -1321,7 +1321,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { /* Reallocate the dictEntry, key and value allocations in a bucket using the * provided allocation functions in order to defrag them. */ -static void dictDefragBucket(dictEntry **bucketref, dictDefragFunctions *defragfns, void *privdata) { +static void dictDefragBucket(dictEntry **bucketref, const dictDefragFunctions *defragfns, void *privdata) { dictDefragAllocFunction *defragalloc = defragfns->defragAlloc; dictDefragAllocFunction *defragkey = defragfns->defragKey; dictDefragAllocFunction *defragval = defragfns->defragVal; @@ -1499,7 +1499,7 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *pri * where NULL means that no reallocation happened and the old memory is still * valid. 
*/ unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata) { +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata) { int htidx0, htidx1; const dictEntry *de, *next; unsigned long m0, m1; diff --git a/src/dict.h b/src/dict.h index 88ebd7bf99..854d026cdc 100644 --- a/src/dict.h +++ b/src/dict.h @@ -238,7 +238,7 @@ void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size); diff --git a/src/kvstore.c b/src/kvstore.c index 49662f330a..344a8af5cf 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -739,7 +739,7 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata) { dict *d = kvstoreGetDict(kvs, didx); if (!d) return 0; @@ -750,14 +750,27 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, * within dict, it only reallocates the memory used by the dict structure itself using * the provided allocation function. This feature was added for the active defrag feature. * - * The 'defragfn' callback is called with a reference to the dict - * that callback can reallocate. 
*/ -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { + * With 16k dictionaries for cluster mode with 1 shard, this operation may require substantial time + * to execute. A "cursor" is used to perform the operation iteratively. When first called, a + * cursor value of 0 should be provided. The return value is an updated cursor which should be + * provided on the next iteration. The operation is complete when 0 is returned. + * + * The 'defragfn' callback is called with a reference to the dict that callback can reallocate. */ +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn) { + for (int didx = cursor; didx < kvs->num_dicts; didx++) { dict **d = kvstoreGetDictRef(kvs, didx), *newd; if (!*d) continue; + + listNode *rehashing_node = NULL; + if (listLength(kvs->rehashing) > 0) { + rehashing_node = ((kvstoreDictMetadata *)dictMetadata(*d))->rehashing_node; + } + if ((newd = defragfn(*d))) *d = newd; + if (rehashing_node) listNodeValue(rehashing_node) = *d; + return (didx + 1); } + return 0; } uint64_t kvstoreGetHash(kvstore *kvs, const void *key) { diff --git a/src/kvstore.h b/src/kvstore.h index 81a0d9a96e..00ec472e73 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -68,10 +68,10 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata); typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn); +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn); void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry 
**existing); diff --git a/src/server.c b/src/server.c index ef9f523145..d77f67248c 100644 --- a/src/server.c +++ b/src/server.c @@ -1140,8 +1140,8 @@ void databasesCron(void) { } } - /* Defrag keys gradually. */ - activeDefragCycle(); + /* Start active defrag cycle or adjust defrag CPU if needed. */ + monitorActiveDefrag(); /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1611,24 +1611,7 @@ void whileBlockedCron(void) { mstime_t latency; latencyStartMonitor(latency); - /* In some cases we may be called with big intervals, so we may need to do - * extra work here. This is because some of the functions in serverCron rely - * on the fact that it is performed every 10 ms or so. For instance, if - * activeDefragCycle needs to utilize 25% cpu, it will utilize 2.5ms, so we - * need to call it multiple times. */ - long hz_ms = 1000 / server.hz; - while (server.blocked_last_cron < server.mstime) { - /* Defrag keys gradually. */ - activeDefragCycle(); - - server.blocked_last_cron += hz_ms; - - /* Increment cronloop so that run_with_period works. */ - server.cronloops++; - } - - /* Other cron jobs do not need to be done in a loop. No need to check - * server.blocked_last_cron since we have an early exit at the top. 
*/ + defragWhileBlocked(); /* Update memory stats during loading (excluding blocked scripts) */ if (server.loading) cronUpdateMemoryStats(); @@ -2120,7 +2103,7 @@ void initServerConfig(void) { server.aof_flush_postponed_start = 0; server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; - server.active_defrag_running = 0; + server.active_defrag_cpu_percent = 0; server.active_defrag_configuration_changed = 0; server.notify_keyspace_events = 0; server.blocked_clients = 0; @@ -2722,8 +2705,6 @@ void initServer(void) { server.db[j].watched_keys = dictCreate(&keylistDictType); server.db[j].id = j; server.db[j].avg_ttl = 0; - server.db[j].defrag_later = listCreate(); - listSetFreeMethod(server.db[j].defrag_later, (void (*)(void *))sdsfree); } evictionPoolAlloc(); /* Initialize the LRU keys pool. */ /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which @@ -5704,7 +5685,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "mem_aof_buffer:%zu\r\n", mh->aof_buffer, "mem_allocator:%s\r\n", ZMALLOC_LIB, "mem_overhead_db_hashtable_rehashing:%zu\r\n", mh->overhead_db_hashtable_rehashing, - "active_defrag_running:%d\r\n", server.active_defrag_running, + "active_defrag_running:%d\r\n", server.active_defrag_cpu_percent, "lazyfree_pending_objects:%zu\r\n", lazyfreeGetPendingObjectsCount(), "lazyfreed_objects:%zu\r\n", lazyfreeGetFreedObjectsCount())); freeMemoryOverheadData(mh); diff --git a/src/server.h b/src/server.h index b9e8be9479..0aac1acbd8 100644 --- a/src/server.h +++ b/src/server.h @@ -961,7 +961,6 @@ typedef struct serverDb { int id; /* Database ID */ long long avg_ttl; /* Average TTL, just for stats */ unsigned long expires_cursor; /* Cursor of the active expire cycle. */ - list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. 
*/ } serverDb; /* forward declaration for functions ctx */ @@ -1702,7 +1701,7 @@ struct valkeyServer { int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ int activerehashing; /* Incremental rehash in serverCron() */ - int active_defrag_running; /* Active defragmentation running (holds current scan aggressiveness) */ + int active_defrag_cpu_percent; /* Current desired CPU percentage for active defrag */ char *pidfile; /* PID file path */ int arch_bits; /* 32 or 64 depending on sizeof(long) */ int cronloops; /* Number of times the cron function run */ @@ -1899,8 +1898,9 @@ struct valkeyServer { size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ - int active_defrag_cycle_min; /* minimal effort for defrag in CPU percentage */ - int active_defrag_cycle_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cpu_min; /* minimal effort for defrag in CPU percentage */ + int active_defrag_cpu_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cycle_us; /* standard duration of defrag cycle */ unsigned long active_defrag_max_scan_fields; /* maximum number of fields of set/hash/zset/list to process from within the main dict scan */ size_t client_max_querybuf_len; /* Limit for client query buffer length */ @@ -3353,7 +3353,8 @@ void bytesToHuman(char *s, size_t size, unsigned long long n); void enterExecutionUnit(int update_cached_time, long long us); void exitExecutionUnit(void); void resetServerStats(void); -void activeDefragCycle(void); +void monitorActiveDefrag(void); +void defragWhileBlocked(void); unsigned int getLRUClock(void); unsigned int LRU_CLOCK(void); const 
char *evictPolicyToString(void); diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 67329f03f1..abd23b1d83 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -40,7 +40,6 @@ run_solo {defrag} { proc test_active_defrag {type} { if {[string match {*jemalloc*} [s mem_allocator]] && [r debug mallctl arenas.page] <= 8192} { test "Active defrag main dictionary: $type" { - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -89,6 +88,8 @@ run_solo {defrag} { r config set active-defrag-cycle-min 65 r config set active-defrag-cycle-max 75 + after 1000 ;# Give defrag time to work (might be multiple cycles) + # Wait for the active defrag to stop working. wait_for_condition 2000 100 { [s active_defrag_running] eq 0 @@ -138,12 +139,13 @@ run_solo {defrag} { r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec r debug loadaof + after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no + # measure hits and misses right after aof loading set misses [s active_defrag_misses] set hits [s active_defrag_hits] - after 120 ;# serverCron only updates the info once in 100ms set frag [s allocator_frag_ratio] set max_latency 0 foreach event [r latency latest] { @@ -181,7 +183,6 @@ run_solo {defrag} { r flushdb sync r script flush sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -203,7 +204,7 @@ run_solo {defrag} { $rd read ; # Discard script load replies $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms + after 1000 ;# give defrag some time to work if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -239,6 +240,8 @@ run_solo {defrag} { fail "defrag not started." 
} + after 1000 ;# Give defrag time to work (might be multiple cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -266,7 +269,6 @@ run_solo {defrag} { test "Active defrag big keys: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -361,6 +363,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -407,7 +411,6 @@ run_solo {defrag} { test "Active defrag pubsub: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -430,7 +433,6 @@ run_solo {defrag} { $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -466,6 +468,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -475,6 +479,7 @@ run_solo {defrag} { puts [r memory malloc-stats] fail "defrag didn't stop." 
} + r config set activedefrag no ;# disable before we accidentally create more frag # test the fragmentation is lower after 120 ;# serverCron only updates the info once in 100ms @@ -507,7 +512,6 @@ run_solo {defrag} { test "Active defrag big list: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -561,6 +565,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -619,7 +625,6 @@ run_solo {defrag} { start_server {tags {"defrag"} overrides {save ""}} { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -685,6 +690,8 @@ run_solo {defrag} { fail "defrag not started." 
} + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl index e169f8de9b..6d8f55bd06 100644 --- a/tests/unit/moduleapi/defrag.tcl +++ b/tests/unit/moduleapi/defrag.tcl @@ -2,7 +2,6 @@ set testmodule [file normalize tests/modules/defragtest.so] start_server {tags {"modules"} overrides {{save ""}}} { r module load $testmodule 10000 - r config set hz 100 r config set active-defrag-ignore-bytes 1 r config set active-defrag-threshold-lower 0 r config set active-defrag-cycle-min 99 diff --git a/valkey.conf b/valkey.conf index 8d3e11c515..b997e8179b 100644 --- a/valkey.conf +++ b/valkey.conf @@ -2381,9 +2381,8 @@ rdb-save-incremental-fsync yes # Fragmentation is a natural process that happens with every allocator (but # less so with Jemalloc, fortunately) and certain workloads. Normally a server # restart is needed in order to lower the fragmentation, or at least to flush -# away all the data and create it again. However thanks to this feature -# implemented by Oran Agra, this process can happen at runtime -# in a "hot" way, while the server is running. +# away all the data and create it again. However thanks to this feature, this +# process can happen at runtime in a "hot" way, while the server is running. # # Basically when the fragmentation is over a certain level (see the # configuration options below) the server will start to create new copies of the @@ -2421,18 +2420,23 @@ rdb-save-incremental-fsync yes # Maximum percentage of fragmentation at which we use maximum effort # active-defrag-threshold-upper 100 -# Minimal effort for defrag in CPU percentage, to be used when the lower -# threshold is reached +# Minimal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the lower threshold is reached. 
# active-defrag-cycle-min 1 -# Maximal effort for defrag in CPU percentage, to be used when the upper -# threshold is reached +# Maximal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the upper threshold is reached. # active-defrag-cycle-max 25 # Maximum number of set/hash/zset/list fields that will be processed from # the main dictionary scan # active-defrag-max-scan-fields 1000 +# The time spent (in microseconds) of the periodic active defrag process. This +# affects the latency impact of active defrag on client commands. Smaller numbers +# will result in less latency impact at the cost of increased defrag overhead. +# active-defrag-cycle-us 500 + # Jemalloc background thread for purging will be enabled by default jemalloc-bg-thread yes From 9f8b174c2eec4be1b6bc15745ac479c95dbd3a6b Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:20:31 +0200 Subject: [PATCH 77/92] Optimize IO thread offload for modified argv (#1360) ### Improve expired commands performance with IO threads #### Background In our IO threads architecture, IO threads allocate client argv's and later when we free it after processCommand we offload its free to the IO threads. With jemalloc, it's crucial that the same thread that allocates memory also frees it. For some commands we modify the client's argv in the main thread during command processing (for example in `SET EX` command we rewrite the command to use absolute time for replication propagation). #### Current issues 1. When commands are rewritten (e.g., expire commands), we store the original argv in `c->original_argv`. However, we're currently: - Freeing new argv (allocated by main thread) in IO threads - Freeing original argv (allocated by IO threads) in main thread 2. Currently, `c->original_argv` points to new array with old objects, while `c->argv` has old array with new objects, making memory free management complicated. 
#### Changes 1. Refactored argv modification handling code to ensure consistency - both array and objects are now either all new or all old 2. Moved original_argv cleanup to happen in resetClient after argv cleanup 3. Modified IO threads code to properly handle original argv cleanup when argv are modified. #### Performance Impact Benchmark with `SET EX` commands (650 clients, 512 byte value, 8 IO threads): - New implementation: **729,548 ops/sec** - Old implementation: **633,243 ops/sec** Representing a **~15%** performance improvement due to more efficient memory handling. --------- Signed-off-by: Uri Yagelnik Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> --- src/blocked.c | 1 - src/io_threads.c | 18 ++--- src/io_threads.h | 2 +- src/multi.c | 4 ++ src/networking.c | 90 ++++++++++++++++-------- src/server.c | 4 -- src/unit/test_files.h | 4 ++ src/unit/test_networking.c | 131 +++++++++++++++++++++++++++++++++++ tests/unit/introspection.tcl | 26 +++++++ 9 files changed, 235 insertions(+), 45 deletions(-) create mode 100644 src/unit/test_networking.c diff --git a/src/blocked.c b/src/blocked.c index 8e1974a703..aeec560b3f 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -206,7 +206,6 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { - freeClientOriginalArgv(c); /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, diff --git a/src/io_threads.c b/src/io_threads.c index f4471b96d0..1ebd748bc2 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -441,8 +441,8 @@ void 
IOThreadFreeArgv(void *data) { /* This function attempts to offload the client's argv to an IO thread. * Returns C_OK if the client's argv were successfully offloaded to an IO thread, * C_ERR otherwise. */ -int tryOffloadFreeArgvToIOThreads(client *c) { - if (server.active_io_threads_num <= 1 || c->argc == 0) { +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv) { + if (server.active_io_threads_num <= 1 || argc == 0) { return C_ERR; } @@ -456,11 +456,11 @@ int tryOffloadFreeArgvToIOThreads(client *c) { int last_arg_to_free = -1; /* Prepare the argv */ - for (int j = 0; j < c->argc; j++) { - if (c->argv[j]->refcount > 1) { - decrRefCount(c->argv[j]); + for (int j = 0; j < argc; j++) { + if (argv[j]->refcount > 1) { + decrRefCount(argv[j]); /* Set argv[j] to NULL to avoid double free */ - c->argv[j] = NULL; + argv[j] = NULL; } else { last_arg_to_free = j; } @@ -468,17 +468,17 @@ int tryOffloadFreeArgvToIOThreads(client *c) { /* If no argv to free, free the argv array at the main thread */ if (last_arg_to_free == -1) { - zfree(c->argv); + zfree(argv); return C_OK; } /* We set the refcount of the last arg to free to 0 to indicate that * this is the last argument to free. With this approach, we don't need to * send the argc to the IO thread and we can send just the argv ptr. */ - c->argv[last_arg_to_free]->refcount = 0; + argv[last_arg_to_free]->refcount = 0; /* Must succeed as we checked the free space before. 
*/ - IOJobQueue_push(jq, IOThreadFreeArgv, c->argv); + IOJobQueue_push(jq, IOThreadFreeArgv, argv); return C_OK; } diff --git a/src/io_threads.h b/src/io_threads.h index f9a9cf762f..8818f08588 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -9,7 +9,7 @@ int inMainThread(void); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); -int tryOffloadFreeArgvToIOThreads(client *c); +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); diff --git a/src/multi.c b/src/multi.c index bcffb90912..9e1f019244 100644 --- a/src/multi.c +++ b/src/multi.c @@ -238,6 +238,10 @@ void execCommand(client *c) { c->mstate.commands[j].argv = c->argv; c->mstate.commands[j].argv_len = c->argv_len; c->mstate.commands[j].cmd = c->cmd; + + /* The original argv has already been processed for slowlog and monitor, + * so we can safely free it before proceeding to the next command. 
*/ + freeClientOriginalArgv(c); } // restore old DENY_BLOCKING value diff --git a/src/networking.c b/src/networking.c index bbd684a3e5..debd94ddfc 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1488,14 +1488,19 @@ void freeClientOriginalArgv(client *c) { /* We didn't rewrite this client */ if (!c->original_argv) return; - for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); - zfree(c->original_argv); + if (tryOffloadFreeArgvToIOThreads(c, c->original_argc, c->original_argv) == C_ERR) { + for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); + zfree(c->original_argv); + } + c->original_argv = NULL; c->original_argc = 0; } void freeClientArgv(client *c) { - if (tryOffloadFreeArgvToIOThreads(c) == C_ERR) { + /* If original_argv exists, 'c->argv' was allocated by the main thread, + * so it's more efficient to free it directly here rather than offloading to IO threads */ + if (c->original_argv || tryOffloadFreeArgvToIOThreads(c, c->argc, c->argv) == C_ERR) { for (int j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); } @@ -2545,6 +2550,7 @@ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; freeClientArgv(c); + freeClientOriginalArgv(c); c->cur_script = NULL; c->reqtype = 0; c->multibulklen = 0; @@ -4248,16 +4254,53 @@ void securityWarningCommand(client *c) { freeClientAsync(c); } -/* Keep track of the original command arguments so that we can generate - * an accurate slowlog entry after the command has been executed. */ -static void retainOriginalCommandVector(client *c) { - /* We already rewrote this command, so don't rewrite it again */ - if (c->original_argv) return; - c->original_argc = c->argc; - c->original_argv = zmalloc(sizeof(robj *) * (c->argc)); - for (int j = 0; j < c->argc; j++) { - c->original_argv[j] = c->argv[j]; - incrRefCount(c->argv[j]); +/* This function preserves the original command arguments for accurate slowlog recording. 
+ * + * It performs the following operations: + * - Stores the initial command vector if not already saved + * - Manages memory allocation for command argument modifications + * + * new_argc - The new number of arguments to allocate space for if necessary. + * new_argv - Optional pointer to a new argument vector. If NULL, space will be + * allocated for new_argc arguments, preserving the existing arguments. + */ +static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) { + robj **old_argv = c->argv; + int old_argc = c->argc; + + /* Store original arguments if not already saved */ + if (!c->original_argv) { + c->original_argc = old_argc; + c->original_argv = old_argv; + } + + /* Handle direct argv replacement */ + if (new_argv) { + c->argv = new_argv; + } else if (c->original_argv == old_argv || new_argc > old_argc) { + /* Allocate new array if necessary */ + c->argv = zmalloc(sizeof(robj *) * new_argc); + + for (int i = 0; i < old_argc && i < new_argc; i++) { + c->argv[i] = old_argv[i]; + incrRefCount(c->argv[i]); + } + + /* Initialize new argument slots to NULL */ + for (int i = old_argc; i < new_argc; i++) { + c->argv[i] = NULL; + } + } + + c->argc = new_argc; + c->argv_len = new_argc; + + /* Clean up old argv if necessary */ + if (c->argv != old_argv && c->original_argv != old_argv) { + for (int i = 0; i < old_argc; i++) { + if (old_argv[i]) decrRefCount(old_argv[i]); + } + zfree(old_argv); } } @@ -4265,7 +4308,7 @@ static void retainOriginalCommandVector(client *c) { * in the slowlog. This information is stored in the * original_argv array. */ void redactClientCommandArgument(client *c, int argc) { - retainOriginalCommandVector(c); + backupAndUpdateClientArgv(c, c->argc, NULL); if (c->original_argv[argc] == shared.redacted) { /* This argument has already been redacted */ return; @@ -4298,10 +4341,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Completely replace the client command vector with the provided one. 
*/ void replaceClientCommandVector(client *c, int argc, robj **argv) { int j; - retainOriginalCommandVector(c); - freeClientArgv(c); - c->argv = argv; - c->argc = argc; + backupAndUpdateClientArgv(c, argc, argv); c->argv_len_sum = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) c->argv_len_sum += getStringObjectLen(c->argv[j]); @@ -4322,19 +4362,9 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { * free the no longer used objects on c->argv. */ void rewriteClientCommandArgument(client *c, int i, robj *newval) { robj *oldval; - retainOriginalCommandVector(c); + int new_argc = (i >= c->argc) ? i + 1 : c->argc; + backupAndUpdateClientArgv(c, new_argc, NULL); - /* We need to handle both extending beyond argc (just update it and - * initialize the new element) or beyond argv_len (realloc is needed). - */ - if (i >= c->argc) { - if (i >= c->argv_len) { - c->argv = zrealloc(c->argv, sizeof(robj *) * (i + 1)); - c->argv_len = i + 1; - } - c->argc = i + 1; - c->argv[i] = NULL; - } oldval = c->argv[i]; if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); if (newval) c->argv_len_sum += getStringObjectLen(newval); diff --git a/src/server.c b/src/server.c index d77f67248c..21dca85067 100644 --- a/src/server.c +++ b/src/server.c @@ -3659,10 +3659,6 @@ void call(client *c, int flags) { replicationFeedMonitors(c, server.monitors, c->db->id, argv, argc); } - /* Clear the original argv. - * If the client is blocked we will handle slowlog when it is unblocked. */ - if (!c->flag.blocked) freeClientOriginalArgv(c); - /* Populate the per-command and per-slot statistics that we show in INFO commandstats and CLUSTER SLOT-STATS, * respectively. If the client is blocked we will handle latency stats and duration when it is unblocked. 
*/ if (update_command_stats && !c->flag.blocked) { diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 6ab7373007..bc3eac4222 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -84,6 +84,8 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); int test_quicklistCreateList(int argc, char **argv, int flags); int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); @@ -216,6 +218,7 @@ unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, N unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, 
{"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", 
test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", 
test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, 
{"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", 
test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", 
test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; @@ -237,6 +240,7 @@ struct unitTestSuite { {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_networking.c", __test_networking_c}, {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c new file mode 100644 index 0000000000..ac042d907f --- /dev/null +++ b/src/unit/test_networking.c @@ -0,0 +1,131 @@ +#include + +#include "../networking.c" +#include "../server.c" +#include "test_help.h" + +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + + /* Test 1: Initial backup of arguments */ + c->argc = 2; + robj **initial_argv = zmalloc(sizeof(robj *) * 2); + c->argv = initial_argv; + c->argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + c->argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + c->original_argv = NULL; + + backupAndUpdateClientArgv(c, 3, NULL); + + TEST_ASSERT(c->argv != initial_argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->argc == 3); + TEST_ASSERT(c->argv_len == 3); + TEST_ASSERT(c->argv[0]->refcount == 2); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv[2] == NULL); + + /* Test 2: Direct argv replacement */ + robj **new_argv = zmalloc(sizeof(robj *) * 2); + new_argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + new_argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + + backupAndUpdateClientArgv(c, 2, new_argv); + + TEST_ASSERT(c->argv == new_argv); + TEST_ASSERT(c->argc == 2); + TEST_ASSERT(c->argv_len == 2); + TEST_ASSERT(c->original_argv != 
c->argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->original_argv[0]->refcount == 1); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Expanding argc */ + backupAndUpdateClientArgv(c, 4, NULL); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv_len == 4); + TEST_ASSERT(c->argv[0] != NULL); + TEST_ASSERT(c->argv[1] != NULL); + TEST_ASSERT(c->argv[2] == NULL); + TEST_ASSERT(c->argv[3] == NULL); + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->original_argc; i++) { + decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + zfree(c); + + return 0; +} + +int test_rewriteClientCommandArgument(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + c->argc = 3; + robj **initial_argv = zmalloc(sizeof(robj *) * 3); + c->argv = initial_argv; + c->original_argv = NULL; + c->argv_len_sum = 0; + + /* Initialize client with command "SET key value" */ + c->argv[0] = createStringObject("SET", 3); + robj *original_key = createStringObject("key", 3); + c->argv[1] = original_key; + c->argv[2] = createStringObject("value", 5); + c->argv_len_sum = 11; // 3 + 3 + 5 + + /* Test 1: Rewrite existing argument */ + robj *newval = createStringObject("newkey", 6); + rewriteClientCommandArgument(c, 1, newval); + + TEST_ASSERT(c->argv[1] == newval); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv_len_sum == 14); // 3 + 6 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argv[1] == original_key); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Extend argument vector */ + robj *extraval = createStringObject("extra", 5); + rewriteClientCommandArgument(c, 3, extraval); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv[3] == 
extraval); + TEST_ASSERT(c->argv_len_sum == 19); // 3 + 6 + 5 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + + for (int i = 0; i < c->original_argc; i++) { + if (c->original_argv[i]) decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + decrRefCount(newval); + decrRefCount(extraval); + + zfree(c); + + return 0; +} diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index d79bb1c7da..a51f07927d 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -376,6 +376,32 @@ start_server {tags {"introspection"}} { $rd close } + # This test verifies that MONITOR correctly records overwritten commands + # when executed within a MULTI-EXEC block. Specifically, it checks that even if + # the original SET-EX command arguments are overwritten for replica propagation, the MONITOR output + # still shows the original command. 
+ test {MONITOR correctly records SET EX in MULTI-EXEC} { + # Start monitoring client + set rd [valkey_deferring_client] + $rd monitor + $rd read ; # Discard the OK + + # Execute multi-exec block with SET EX commands + r multi + r set "{slot}key1" value1 ex 3600 + r set "{slot}key2" value2 ex 1800 + r exec + + # Verify monitor output shows the original commands: + assert_match {*"multi"*} [$rd read] + assert_match {*"set"*"{slot}key1"*"value1"*"ex"*"3600"*} [$rd read] + assert_match {*"set"*"{slot}key2"*"value2"*"ex"*"1800"*} [$rd read] + assert_match {*"exec"*} [$rd read] + + # Clean up monitoring client + $rd close + } + test {MONITOR log blocked command only once} { # need to reconnect in order to reset the clients state reconnect From 349bc7547bf28ae304537bd6888d575e2409f25c Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 3 Dec 2024 11:19:53 -0800 Subject: [PATCH 78/92] defrag: use monotime in module interface (#1388) The recent PR (https://github.com/valkey-io/valkey/pull/1242) converted Active Defrag to use `monotime`. In that change, a conversion was performed to continue to use `ustime()` as part of the module interface. Since this time is only used internally, and never actually exposed to the module, we can convert this to use `monotime` directly. 
Signed-off-by: Jim Brunner --- src/defrag.c | 3 +-- src/module.c | 6 +++--- src/server.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index d0c7632f17..9c195e8959 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -905,8 +905,7 @@ static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtim } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtimeWallClock, dbid); + return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } diff --git a/src/module.c b/src/module.c index 794038beb4..4092ae6b06 100644 --- a/src/module.c +++ b/src/module.c @@ -13344,7 +13344,7 @@ const char *VM_GetCurrentCommandName(ValkeyModuleCtx *ctx) { * defrag callback. */ struct ValkeyModuleDefragCtx { - long long int endtime; + monotime endtime; unsigned long *cursor; struct serverObject *key; /* Optional name of key processed, NULL when unknown. */ int dbid; /* The dbid of the key being processed, -1 when unknown. */ @@ -13373,7 +13373,7 @@ int VM_RegisterDefragFunc(ValkeyModuleCtx *ctx, ValkeyModuleDefragFunc cb) { * so it generally makes sense to do small batches of work in between calls. */ int VM_DefragShouldStop(ValkeyModuleDefragCtx *ctx) { - return (ctx->endtime != 0 && ctx->endtime < ustime()); + return (ctx->endtime != 0 && ctx->endtime <= getMonotonicUs()); } /* Store an arbitrary cursor value for future re-use. @@ -13455,7 +13455,7 @@ ValkeyModuleString *VM_DefragValkeyModuleString(ValkeyModuleDefragCtx *ctx, Valk * Returns a zero value (and initializes the cursor) if no more needs to be done, * or a non-zero value otherwise. 
*/ -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid) { +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid) { moduleValue *mv = value->ptr; moduleType *mt = mv->type; diff --git a/src/server.h b/src/server.h index 0aac1acbd8..896ff735b3 100644 --- a/src/server.h +++ b/src/server.h @@ -2732,7 +2732,7 @@ size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); From 105509cdad1e667cd15ad751bf8b918d9ca1ca06 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 4 Dec 2024 06:09:56 +0800 Subject: [PATCH 79/92] Run RDMA builtin in CI workflow (#1380) Since 4695d118dd (#1209), RDMA supports builtin. And module connection type may be removed in future. So run a builtin RDMA support for CI workflow. RDMA module is complied only in CI, keep it building check only until module connection type gets obsolete. 
Signed-off-by: zhenwei pi --- .github/workflows/ci.yml | 10 +++++++--- tests/rdma/run.py | 3 +-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fec424cee..df3eaa1905 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,10 +77,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make + - name: prepare-development-libraries + run: sudo apt-get install librdmacm-dev libibverbs-dev + - name: make-rdma-module + run: make -j4 BUILD_RDMA=module + - name: make-rdma-builtin run: | - sudo apt-get install librdmacm-dev libibverbs-dev - make -j4 BUILD_RDMA=module + make distclean + make -j4 BUILD_RDMA=yes - name: clone-rxe-kmod run: | mkdir -p tests/rdma/rxe diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 09168f368a..77e0f285fe 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -60,10 +60,9 @@ def test_rdma(ipaddr): # step 2, start server svrpath = valkeydir + "/src/valkey-server" - rdmapath = valkeydir + "/src/valkey-rdma.so" svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", - "--loadmodule", rdmapath, "--rdma-port", "6379", "--rdma-bind", ipaddr] + "--rdma-port", "6379", "--rdma-bind", ipaddr] svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE) try: From a401e3789d58c4d41769c3099a5f1cc009130994 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 4 Dec 2024 10:33:14 -0800 Subject: [PATCH 80/92] Update code of conduct maintainers email address (#1391) Updating code of conduct maintainer's email address Signed-off-by: Madelyn Olson --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 1c530ec7ba..36764bb81b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -49,7 +49,7 
@@ representative at an online or offline event. Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -this email address: placeholderkv@gmail.com. +this email address: maintainers@lists.valkey.io. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. From 71560a2a4a1a73085dba9a8ea8f835c371358cfa Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 5 Dec 2024 11:58:24 -0500 Subject: [PATCH 81/92] Add API UpdateRuntimeArgs for updating the module arguments during runtime (#1041) Before Redis OSS 7, if we load a module with some arguments during runtime, and run the command "config rewrite", the module information will not be saved into the config file. Since Redis OSS 7 and Valkey 7.2, if we load a module with some arguments during runtime, the module information (path, arguments number, and arguments value) can be saved into the config file after config rewrite command is called. Thus, the module will be loaded automatically when the server startup next time. Following is one example: bind 172.25.0.58 port 7000 protected-mode no enable-module-command yes Generated by CONFIG REWRITE latency-tracking-info-percentiles 50 99 99.9 dir "/home/ubuntu/valkey" save 3600 1 300 100 60 10000 user default on nopass sanitize-payload ~* &* +https://github.com/ALL loadmodule tests/modules/datatype.so 10 20 However, there is one problem. If developers write a module, and update the running arguments by someway, the updated arguments can not be saved into the config file even "config rewrite" is called. The reason comes from the following function rewriteConfigLoadmoduleOption (src/config.c) void rewriteConfigLoadmoduleOption(struct rewriteConfigState *state) { .......... 
struct ValkeyModule *module = dictGetVal(de); line = sdsnew("loadmodule "); line = sdscatsds(line, module->loadmod->path); for (int i = 0; i < module->loadmod->argc; i++) { line = sdscatlen(line, " ", 1); line = sdscatsds(line, module->loadmod->argv[i]->ptr); } rewriteConfigRewriteLine(state, "loadmodule", line, 1); ....... } The function only save the initial arguments information (module->loadmod) into the configfile. After core members discuss, ref https://github.com/valkey-io/valkey/issues/1177 We decide add the following API to implement this feature: Original proposal: int VM_UpdateRunTimeArgs(ValkeyModuleCtx *ctx, int index, char *value); Updated proposal: ValkeyModuleString **values VM_GetRuntimeArgs(ValkeyModuleCtx *ctx); **int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, int argc, ValkeyModuleString **values); Why we do not recommend the following way: MODULE UNLOAD Update module args in the conf file MODULE LOAD I think there are the following disadvantages: 1. Some modules can not be unloaded. Such as the example module datatype.so, which is tests/modules/datatype.so 2. it is not atomic operation for MODULE UNLOAD + MODULE LOAD 3. sometimes, if we just run the module unload, the client business could be interrupted --------- Signed-off-by: hwware --- src/module.c | 22 ++++++++++++++++++++ src/valkeymodule.h | 2 ++ tests/modules/Makefile | 1 + tests/modules/moduleparameter.c | 28 ++++++++++++++++++++++++++ tests/unit/moduleapi/moduleconfigs.tcl | 13 +++++++++++- 5 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 tests/modules/moduleparameter.c diff --git a/src/module.c b/src/module.c index 4092ae6b06..5f9dff0402 100644 --- a/src/module.c +++ b/src/module.c @@ -2255,6 +2255,27 @@ int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd) { return (cp->module == module_handle); } +/* ValkeyModule_UpdateRuntimeArgs can be used to update the module argument values. 
+ * The function parameter 'argc' indicates the number of updated arguments, and 'argv' + * represents the values of the updated arguments. + * Once 'CONFIG REWRITE' command is called, the updated argument values can be saved into conf file. + * + * The function always returns VALKEYMODULE_OK. */ +int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + struct moduleLoadQueueEntry *loadmod = ctx->module->loadmod; + for (int i = 0; i < loadmod->argc; i++) { + decrRefCount(loadmod->argv[i]); + } + zfree(loadmod->argv); + loadmod->argv = argc - 1 ? zmalloc(sizeof(robj *) * (argc - 1)) : NULL; + loadmod->argc = argc - 1; + for (int i = 1; i < argc; i++) { + loadmod->argv[i - 1] = argv[i]; + incrRefCount(loadmod->argv[i - 1]); + } + return VALKEYMODULE_OK; +} + /* -------------------------------------------------------------------------- * ## Module information and time measurement * -------------------------------------------------------------------------- */ @@ -13560,6 +13581,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(SetModuleAttribs); REGISTER_API(IsModuleNameBusy); REGISTER_API(WrongArity); + REGISTER_API(UpdateRuntimeArgs); REGISTER_API(ReplyWithLongLong); REGISTER_API(ReplyWithError); REGISTER_API(ReplyWithErrorFormat); diff --git a/src/valkeymodule.h b/src/valkeymodule.h index c2cdb2f0e7..7c3adfd477 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -967,6 +967,7 @@ VALKEYMODULE_API void (*ValkeyModule_SetModuleAttribs)(ValkeyModuleCtx *ctx, con VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_IsModuleNameBusy)(const char *name) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_WrongArity)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; +VALKEYMODULE_API int (*ValkeyModule_UpdateRuntimeArgs)(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_ReplyWithLongLong)(ValkeyModuleCtx *ctx, long long ll) VALKEYMODULE_ATTR; VALKEYMODULE_API int 
(*ValkeyModule_GetSelectedDb)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_SelectDb)(ValkeyModuleCtx *ctx, int newid) VALKEYMODULE_ATTR; @@ -1673,6 +1674,7 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in VALKEYMODULE_GET_API(SetModuleAttribs); VALKEYMODULE_GET_API(IsModuleNameBusy); VALKEYMODULE_GET_API(WrongArity); + VALKEYMODULE_GET_API(UpdateRuntimeArgs); VALKEYMODULE_GET_API(ReplyWithLongLong); VALKEYMODULE_GET_API(ReplyWithError); VALKEYMODULE_GET_API(ReplyWithErrorFormat); diff --git a/tests/modules/Makefile b/tests/modules/Makefile index 1690b9b627..82813bb6f7 100644 --- a/tests/modules/Makefile +++ b/tests/modules/Makefile @@ -58,6 +58,7 @@ TEST_MODULES = \ eventloop.so \ moduleconfigs.so \ moduleconfigstwo.so \ + moduleparameter.so \ publish.so \ usercall.so \ postnotifications.so \ diff --git a/tests/modules/moduleparameter.c b/tests/modules/moduleparameter.c new file mode 100644 index 0000000000..6c110f2cfb --- /dev/null +++ b/tests/modules/moduleparameter.c @@ -0,0 +1,28 @@ +#include "valkeymodule.h" +#include +#include +#include +#include + +int test_module_update_parameter(ValkeyModuleCtx *ctx, + ValkeyModuleString **argv, int argc) { + + ValkeyModule_UpdateRuntimeArgs(ctx, argv, argc); + return ValkeyModule_ReplyWithSimpleString(ctx, "OK"); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "moduleparameter", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + if (ValkeyModule_CreateCommand(ctx, "testmoduleparameter.update.parameter", + test_module_update_parameter, "fast", 0, 0, + 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 44f994d2d0..54de5f2611 100644 --- 
a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -1,5 +1,7 @@ set testmodule [file normalize tests/modules/moduleconfigs.so] set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so] +set testmoduleparameter [file normalize tests/modules/moduleparameter.so] + start_server {tags {"modules"}} { r module load $testmodule @@ -243,5 +245,14 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.memory_numeric] "moduleconfigs.memory_numeric 1024" } } -} + test {Module Update Args} { + r module load $testmoduleparameter 10 20 30 + set t [r module list] + set modulename [lmap x [r module list] {dict get $x name}] + assert_not_equal [lsearch $modulename moduleparameter] -1 + assert_equal "{10 20 30}" [lmap x [r module list] {dict get $x args}] + assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70] + assert_equal "{40 50 60 70}" [lmap x [r module list] {dict get $x args}] + } +} From 6b3e1228cd043ebd35eec9c4354c933d5a8f968c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A3=8E=E5=8E=BB=E5=B9=BD=E5=A2=A8?= <43802771+fengquyoumo@users.noreply.github.com> Date: Fri, 6 Dec 2024 01:26:56 +0800 Subject: [PATCH 82/92] RDMA: Fix dead loop when transfer large data (20KB) (#1386) Determine the status of the Client when attempting to read data. If state=CLIENT_COMPLETED_IO, no read attempt is made and I/O operations on the Client are rescheduled by the main thread. 
> And 20474 Byte = PROTO_IOBUF_LEN(16KB) + SDS_HDR_VAR(16, s)(4090 Byte) Fixes #1385 --------- Signed-off-by: fengquyoumo <1455117463@qq.com> --- src/rdma.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/rdma.c b/src/rdma.c index de7ea396a1..7fe65ad2d2 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -77,9 +77,12 @@ typedef enum ValkeyRdmaOpcode { #define VALKEY_RDMA_INVALID_OPCODE 0xffff #define VALKEY_RDMA_KEEPALIVE_MS 3000 +#define RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 0) + typedef struct rdma_connection { connection c; struct rdma_cm_id *cm_id; + int flags; int last_errno; listNode *pending_list_node; } rdma_connection; @@ -693,7 +696,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* uplayer should read all */ - while (ctx->rx.pos < ctx->rx.offset) { + while (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->rx.pos < ctx->rx.offset) { if (conn->read_handler && (callHandler(conn, conn->read_handler) == C_ERR)) { return; } @@ -705,7 +708,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* RDMA comp channel has no POLLOUT event, try to send remaining buffer */ - if ((ctx->tx.offset < ctx->tx.length) && conn->write_handler) { + if (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->tx.offset < ctx->tx.length && conn->write_handler) { callHandler(conn, conn->write_handler); } } @@ -884,6 +887,9 @@ static void connRdmaAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m } static int connRdmaSetRwHandler(connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) return C_OK; + /* IB channel only has POLLIN event */ if (conn->read_handler || conn->write_handler) { if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE, conn->type->ae_handler, conn) == AE_ERR) { @@ -1721,12 +1727,12 @@ static int 
rdmaProcessPendingData(void) { listNode *ln; rdma_connection *rdma_conn; connection *conn; - int processed; + int processed = 0; - processed = listLength(pending_list); listRewind(pending_list, &li); while ((ln = listNext(&li))) { rdma_conn = listNodeValue(ln); + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) continue; conn = &rdma_conn->c; /* a connection can be disconnected by remote peer, CM event mark state as CONN_STATE_CLOSED, kick connection @@ -1741,15 +1747,32 @@ static int rdmaProcessPendingData(void) { callHandler(conn, conn->write_handler); } + ++processed; continue; } connRdmaEventHandler(NULL, -1, rdma_conn, 0); + ++processed; } return processed; } +static void postPoneUpdateRdmaState(struct connection *conn, int postpone) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (postpone) { + rdma_conn->flags |= RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } else { + rdma_conn->flags &= ~RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } +} + +static void updateRdmaState(struct connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + connRdmaSetRwHandler(conn); + connRdmaEventHandler(NULL, -1, rdma_conn, 0); +} + static ConnectionType CT_RDMA = { /* connection type */ .get_type = connRdmaGetType, @@ -1792,6 +1815,8 @@ static ConnectionType CT_RDMA = { /* pending data */ .has_pending_data = rdmaHasPendingData, .process_pending_data = rdmaProcessPendingData, + .postpone_update_state = postPoneUpdateRdmaState, + .update_state = updateRdmaState, }; ConnectionType *connectionTypeRdma(void) { From 6df376d68a97e9c0da4549f57db96742b5482202 Mon Sep 17 00:00:00 2001 From: Caiyi Wu <53631337+Codebells@users.noreply.github.com> Date: Fri, 6 Dec 2024 03:01:38 +0800 Subject: [PATCH 83/92] Fix coredump when use hellodict example module (#1395) In the ValkeyModule_OnLoad method of the file hellodict.c, the parameter keystep of ValkeyModule_CreateCommand should be 1. Otherwise, execute command will coredump. 
MODULE LOAD /home/tiger/valkey/src/modules/hellodict.so COMMAND GETKEYS HELLODICT.SET key value Signed-off-by: Codebells <1347103071@qq.com> --- src/modules/hellodict.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/modules/hellodict.c b/src/modules/hellodict.c index e0af06ba2f..db2fd17e8a 100644 --- a/src/modules/hellodict.c +++ b/src/modules/hellodict.c @@ -109,13 +109,13 @@ int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int arg if (ValkeyModule_Init(ctx, "hellodict", 1, VALKEYMODULE_APIVER_1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; /* Create our global dictionary. Here we'll set our keys and values. 
*/ From a2fe6af457e353425d39c858b8cf68f1b4d6a9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sat, 7 Dec 2024 10:25:40 +0100 Subject: [PATCH 84/92] Fix Module Update Args test when other modules are loaded (#1403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #1400 Signed-off-by: Viktor Söderqvist --- tests/unit/moduleapi/moduleconfigs.tcl | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 54de5f2611..2474ad3567 100644 --- a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -2,6 +2,14 @@ set testmodule [file normalize tests/modules/moduleconfigs.so] set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so] set testmoduleparameter [file normalize tests/modules/moduleparameter.so] +proc module_get_args {mod} { + foreach line [r module list] { + if {[dict get $line name] eq $mod} { + return [dict get $line args] + } + } + throw error {module not found} +} start_server {tags {"modules"}} { r module load $testmodule @@ -246,13 +254,13 @@ start_server {tags {"modules"}} { } } test {Module Update Args} { - r module load $testmoduleparameter 10 20 30 + r module load $testmoduleparameter 10 20 30 - set t [r module list] - set modulename [lmap x [r module list] {dict get $x name}] - assert_not_equal [lsearch $modulename moduleparameter] -1 - assert_equal "{10 20 30}" [lmap x [r module list] {dict get $x args}] - assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70] - assert_equal "{40 50 60 70}" [lmap x [r module list] {dict get $x args}] + set t [r module list] + set modulename [lmap x [r module list] {dict get $x name}] + assert_not_equal [lsearch $modulename moduleparameter] -1 + assert_equal {10 20 30} [module_get_args moduleparameter] + assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70] + 
assert_equal {40 50 60 70} [module_get_args moduleparameter] } } From f20d629dbe31d31eb82e360f9da4ef94ba9aabdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sat, 7 Dec 2024 10:26:31 +0100 Subject: [PATCH 85/92] Fix sanitizer builds with clang (#1402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By including after the other includes in the unit test, we can avoid redefining a macro which led to a build failure. Fixes #1394 --------- Signed-off-by: Viktor Söderqvist --- src/unit/test_networking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c index ac042d907f..566583bcc5 100644 --- a/src/unit/test_networking.c +++ b/src/unit/test_networking.c @@ -1,9 +1,9 @@ -#include - #include "../networking.c" #include "../server.c" #include "test_help.h" +#include + int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); From 176fafcaf71793efdadefba8e49ef711748b0c20 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 8 Dec 2024 20:28:14 +0800 Subject: [PATCH 86/92] Add a note to conf about the dangers of modifying dir at runtime (#887) We've had security issues in the past with it, which is why we marked it as PROTECTED. But, modifying during runtime is also a dangerous action. For example, when child processes are running, persistent temp files and log files may have unexpected effects. A scenario for modifying dir at runtime is to migrate a disk failure, such as using disk-based replication to migrate a node, writing nodes.conf to save the cluster configuration. We decided to leave it as is and add a note in the conf about the dangers of modifying dir at runtime. 
Signed-off-by: Binbin --- tests/unit/introspection.tcl | 7 +++++++ valkey.conf | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index a51f07927d..47490a295b 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -980,6 +980,13 @@ start_server {tags {"introspection"}} { } } {} {external:skip} + test {valkey-server command line arguments - dir multiple times} { + start_server {config "default.conf" args {--dir "./" --dir "./"}} { + r config get dir + assert_equal {PONG} [r ping] + } + } {} {external:skip} + # Config file at this point is at a weird state, and includes all # known keywords. Might be a good idea to avoid adding tests here. } diff --git a/valkey.conf b/valkey.conf index b997e8179b..e23aea39de 100644 --- a/valkey.conf +++ b/valkey.conf @@ -582,6 +582,9 @@ rdb-del-sync-files no # The working directory. # +# The server log is written relative this directory, if the 'logfile' +# configuration directive is a relative path. +# # The DB will be written inside this directory, with the filename specified # above using the 'dbfilename' configuration directive. # @@ -591,6 +594,9 @@ rdb-del-sync-files no # 'cluster-config-file' configuration directive is a relative path. # # Note that you must specify a directory here, not a file name. +# Note that modifying 'dir' during runtime may have unexpected behavior, +# for example when a child process is running, related file operations may +# have unexpected effects. 
dir ./ ################################# REPLICATION ################################# From e8078b7315250dc052b4020a4ea73471a8c0e4a9 Mon Sep 17 00:00:00 2001 From: Guillaume Koenig <106696198+knggk@users.noreply.github.com> Date: Sun, 8 Dec 2024 07:30:07 -0500 Subject: [PATCH 87/92] Allow MEMORY MALLOC-STATS and MEMORY PURGE during loading phase (#1317) - Enable investigation of memory issues during loading - Previously, all memory commands were rejected with LOADING error (except memory help) - `MEMORY MALLOC-STATS` and `MEMORTY PURGE` are now allowed as they don't depend on the dataset - `MEMORY STATS` and `MEMORY USAGE KEY` remain disallowed Fixes #1299 Signed-off-by: Guillaume Koenig Signed-off-by: Binbin Co-authored-by: Binbin --- src/commands.def | 4 +-- src/commands/memory-malloc-stats.json | 3 ++ src/commands/memory-purge.json | 3 ++ tests/unit/introspection.tcl | 43 +++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/commands.def b/src/commands.def index ecc77126af..1ac2368ee1 100644 --- a/src/commands.def +++ b/src/commands.def @@ -7320,8 +7320,8 @@ struct COMMAND_ARG MEMORY_USAGE_Args[] = { struct COMMAND_STRUCT MEMORY_Subcommands[] = { {MAKE_CMD("doctor","Outputs a memory problems report.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_DOCTOR_History,0,MEMORY_DOCTOR_Tips,3,memoryCommand,2,0,0,MEMORY_DOCTOR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_HELP_History,0,MEMORY_HELP_Tips,0,memoryCommand,2,CMD_LOADING|CMD_STALE,0,MEMORY_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, 
-{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,0,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,CMD_LOADING,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,CMD_LOADING,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, {MAKE_CMD("stats","Returns details about memory usage.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_STATS_History,0,MEMORY_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_STATS_Keyspecs,0,NULL,0)}, {MAKE_CMD("usage","Estimates the memory usage of a key.","O(N) where N is the number of samples.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_USAGE_History,0,MEMORY_USAGE_Tips,0,memoryCommand,-3,CMD_READONLY,0,MEMORY_USAGE_Keyspecs,1,NULL,2),.args=MEMORY_USAGE_Args}, {0} diff --git a/src/commands/memory-malloc-stats.json b/src/commands/memory-malloc-stats.json index 5ef6a31c40..af5d439744 100644 --- a/src/commands/memory-malloc-stats.json +++ b/src/commands/memory-malloc-stats.json @@ -12,6 +12,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:SPECIAL" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "type": "string", "description": "The memory allocator's internal statistics report." 
diff --git a/src/commands/memory-purge.json b/src/commands/memory-purge.json index 77ed61dc5b..aea3e2d24a 100644 --- a/src/commands/memory-purge.json +++ b/src/commands/memory-purge.json @@ -11,6 +11,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:ALL_SUCCEEDED" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "const": "OK" } diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 47490a295b..bafc46d4b7 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -1042,6 +1042,49 @@ test {config during loading} { } } {} {external:skip} +test {MEMORY commands during loading} { + start_server [list overrides [list key-load-delay 50 loading-process-events-interval-bytes 1024]] { + # Set up some initial data + r debug populate 100000 key 1000 + + # Save and restart + r save + restart_server 0 false false + + # At this point, keys are loaded one at time, busy looping 50usec + # between each. Further, other events are processed every 1024 bytes + # of RDB. We're sending all our commands deferred, so they have a + # chance to be processed all at once between loading two keys. 
+ + set rd [valkey_deferring_client] + + # Allowed during loading + $rd memory help + $rd memory malloc-stats + $rd memory purge + + # Disallowed during loading (because directly dependent on the dataset) + $rd memory doctor + $rd memory stats + $rd memory usage key:1 + + # memory help + assert_match {{MEMORY *}} [$rd read] + # memory malloc-stats + assert_match {*alloc*} [$rd read] + # memory purge + assert_match OK [$rd read] + # memory doctor + assert_error {*LOADING*} {$rd read} + # memory stats + assert_error {*LOADING*} {$rd read} + # memory usage key:1 + assert_error {*LOADING*} {$rd read} + + $rd close + } +} {} {external:skip} + test {CONFIG REWRITE handles rename-command properly} { start_server {tags {"introspection"} overrides {rename-command {flushdb badger}}} { assert_error {ERR unknown command*} {r flushdb} From b09db3ef788896f7192b068b1089c11b761ed3fe Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Mon, 9 Dec 2024 10:01:43 +0200 Subject: [PATCH 88/92] Fix typo in streams seen-time / active-time test (#1409) This variable name is wrong, it causes the wrong variable to be asserted. 
Signed-off-by: Roman Gershman --- tests/unit/type/stream-cgroups.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index d934e48140..d736b9cdb7 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -944,7 +944,7 @@ start_server { # Simulate loading from RDB - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] set prev_seen [dict get $consumer seen-time] @@ -954,7 +954,7 @@ start_server { r DEL mystream r RESTORE mystream 0 $dump - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] assert_equal $prev_seen [dict get $consumer seen-time] From 924729eb1695a8a5913fe32531a8d520560fe70b Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 9 Dec 2024 16:19:02 +0800 Subject: [PATCH 89/92] Fix the election was reset wrongly before failover epoch was obtained (#1339) After #1009, we will reset the election when we received a claim with an equal or higher epoch since a node can win an election in the past. But we need to consider the time before the node actually obtains the failover_auth_epoch. The failover_auth_epoch default is 0, so before the node actually gets the failover epoch, we might wrongly reset the election. This is probably harmless, but will produce misleading log output and may delay election by a cron cycle or beforesleep. Now we will only reset the election when a node actually obtains the failover epoch. 
Signed-off-by: Binbin --- src/cluster_legacy.c | 3 ++- tests/unit/cluster/failover2.tcl | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6ea8eb2e67..50a8ffca38 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3149,7 +3149,8 @@ int clusterProcessPacket(clusterLink *link) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); - if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + sender->configEpoch >= server.cluster->failover_auth_epoch) { /* Another node has claimed an epoch greater than or equal to ours. * If we have an ongoing election, reset it because we cannot win * with an epoch smaller than or equal to the incoming claim. This diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 21c4f4a678..9262049e4e 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -86,6 +86,11 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval fail "No failover detected" } + # Make sure there is no false epoch 0. + verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0 + # Make sure there is no failover timeout. verify_no_log_message -7 "*Failover attempt expired*" 0 verify_no_log_message -8 "*Failover attempt expired*" 0 From 5be4ce6d27c0fb8c046508ff04016a1395ca9d5e Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:48:46 +0200 Subject: [PATCH 90/92] Optimize ZRANK to avoid path comparisons (#1389) ZRANK is a widely used command for workloads using sorted-sets. 
For example, in leaderboards it enables querying the specific rank of a player. The way ZRANK is currently implemented is: 1. locate the element in the SortedSet hashtable. 2. take the score of the element and use it in order to locate the element in the SkipList (when listpack encoding is not used) 3. During the SkipList scan for the element we keep the path and use it in order to sum the span in each path node in order to calculate the element rank. One problem with this approach is that it involves multiple compare operations in order to locate the element. Specifically string comparison can be expensive since it will require accessing multiple memory locations for the items the element string is compared against. Perf analysis showed this can take up to 20% of the rank scan time. (TBD - provide the perf results for example) We can improve the rank search by taking advantage of the fact that the element node in the skiplist is pointed to by the hashtable value! Our Skiplist implementation is using FatKeys, where each added node is assigned a randomly chosen height. Say we keep a height record for every skiplist element. In order to get an element rank we simply: 1. locate the element in the SortedSet hashtable. 2. we go directly to the node in the skiplist. 3. we jump to the full height of the node and take the span value. 4. we continue going forward and always jump to the highest point in each node we get to, making sure to sum all the spans. 5. we take off the summed spans from the SkipList length and we now have the specific node rank. :) In order to test this method I created several benchmarks. All benchmarks used the same seeds and the lists contained 1M elements. 
Since a very important factor is the number of scores compared to the number of elements (since small ratio means more string compares during searches) each benchmark test used a different number of scores (1, 10K, 100K, 1M) some results: **TPS** Scores range | non-optimized | optimized | gain -- | -- | -- | -- 1 | 416042 | 605363 | 45.51% 10K | 359776 | 459200 | 27.63% 100K | 380387 | 459157 | 20.71% 1M | 416059 | 450853 | 8.36% **Latency** Scores range | non-optimized | optimized | gain -- | -- | -- | -- 1 | 1.191000 | 0.831000 | -30.23% 10K | 1.383000 | 1.095000 | -20.82% 100K | 1.311000 | 1.087000 | -17.09% 1M | 1.191000 | 1.119000 | -6.05% ### Memory efficiency adding another field to each skiplist node can cause degradation in memory efficiency for large sortedsets. We use the fact that level 0 recorded span of ALL nodes can either be 1 or zero (for the last node). So we use wrappers in order to get a node span and override the span for level 0 to hold the node height. --------- Signed-off-by: Ran Shidlansik --- src/server.h | 4 ++ src/t_zset.c | 104 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/src/server.h b/src/server.h index 896ff735b3..44de6eada1 100644 --- a/src/server.h +++ b/src/server.h @@ -1449,6 +1449,10 @@ typedef struct zskiplistNode { struct zskiplistNode *backward; struct zskiplistLevel { struct zskiplistNode *forward; + /* At each level we keep the span, which is the number of elements which are on the "subtree" + * from this node at this level to the next node at the same level. + * One exception is the value at level 0. In level 0 the span can only be 1 or 0 (in case the last elements in the list) + * So we use it in order to hold the height of the node, which is the number of levels. 
*/ unsigned long span; } level[]; } zskiplistNode; diff --git a/src/t_zset.c b/src/t_zset.c index a1e71208cb..36a9bfffb1 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -72,12 +72,51 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap); zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); +static inline unsigned long zslGetNodeSpanAtLevel(zskiplistNode *x, int level) { + /* We use the level 0 span in order to hold the node height, so in case the span is requested on + * level 0 and this is not the last node we return 1 and 0 otherwise. For the rest of the levels we just return + * the recorded span in that level. */ + if (level > 0) return x->level[level].span; + return x->level[level].forward ? 1 : 0; +} + +static inline void zslSetNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long span) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span = span; +} + +static inline void zslIncrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long incr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span += incr; +} + +static inline void zslDecrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long decr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span -= decr; +} + +static inline unsigned long zslGetNodeHeight(zskiplistNode *x) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. 
*/ + return x->level[0].span; +} + +static inline void zslSetNodeHeight(zskiplistNode *x, int height) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + x->level[0].span = height; +} + /* Create a skiplist node with the specified number of levels. * The SDS string 'ele' is referenced by the node after the call. */ -zskiplistNode *zslCreateNode(int level, double score, sds ele) { - zskiplistNode *zn = zmalloc(sizeof(*zn) + level * sizeof(struct zskiplistLevel)); +zskiplistNode *zslCreateNode(int height, double score, sds ele) { + zskiplistNode *zn = zmalloc(sizeof(*zn) + height * sizeof(struct zskiplistLevel)); zn->score = score; zn->ele = ele; + zslSetNodeHeight(zn, height); return zn; } @@ -147,7 +186,7 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { - rank[i] += x->level[i].span; + rank[i] += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -161,9 +200,10 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { for (i = zsl->level; i < level; i++) { rank[i] = 0; update[i] = zsl->header; - update[i]->level[i].span = zsl->length; + zslSetNodeSpanAtLevel(update[i], i, zsl->length); } zsl->level = level; + zslSetNodeHeight(zsl->header, level); } x = zslCreateNode(level, score, ele); for (i = 0; i < level; i++) { @@ -171,13 +211,13 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { update[i]->level[i].forward = x; /* update span covered by update[i] as x is inserted here */ - x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]); - update[i]->level[i].span = (rank[0] - rank[i]) + 1; + zslSetNodeSpanAtLevel(x, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); + zslSetNodeSpanAtLevel(update[i], i, (rank[0] - rank[i]) + 1); } /* 
increment span for untouched levels */ for (i = level; i < zsl->level; i++) { - update[i]->level[i].span++; + zslIncrNodeSpanAtLevel(update[i], i, 1); } x->backward = (update[0] == zsl->header) ? NULL : update[0]; @@ -195,10 +235,10 @@ void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == x) { - update[i]->level[i].span += x->level[i].span - 1; + zslIncrNodeSpanAtLevel(update[i], i, zslGetNodeSpanAtLevel(x, i) - 1); update[i]->level[i].forward = x->level[i].forward; } else { - update[i]->level[i].span -= 1; + zslDecrNodeSpanAtLevel(update[i], i, 1); } } if (x->level[0].forward) { @@ -336,7 +376,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -348,7 +388,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -372,7 +412,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslValueLteMax(x->level[i].forward->score, range)) { /* Count the rank of the last element in range. 
*/ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -464,8 +504,8 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned x = zsl->header; for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) < start) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) < start) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -499,7 +539,7 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) <= 0))) { - rank += x->level[i].span; + rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } @@ -511,6 +551,18 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { return 0; } +/* Find the rank for a specific skiplist node. */ +unsigned long zslGetRankByNode(zskiplist *zsl, zskiplistNode *x) { + int i = zslGetNodeHeight(x) - 1; + unsigned long rank = zslGetNodeSpanAtLevel(x, i); + while (x->level[zslGetNodeHeight(x) - 1].forward) { + x = x->level[zslGetNodeHeight(x) - 1].forward; + rank += zslGetNodeSpanAtLevel(x, zslGetNodeHeight(x) - 1); + } + rank = zsl->length - rank; + return rank; +} + /* Finds an element by its rank from start node. The rank argument needs to be 1-based. 
*/ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { zskiplistNode *x; @@ -519,8 +571,8 @@ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_ x = start_node; for (i = start_level; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) <= rank) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) <= rank) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } if (traversed == rank) { @@ -690,7 +742,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -702,7 +754,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -726,7 +778,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslLexValueLteMax(x->level[i].forward->ele, range)) { /* Count the rank of the last element in range. 
*/ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -1173,6 +1225,13 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig * Common sorted set API *----------------------------------------------------------------------------*/ +/* Utility function used for mapping the hashtable entry to the matching skiplist node. + * For example, this is used in case of ZRANK query. */ +static inline zskiplistNode *zsetGetSLNodeByEntry(dictEntry *de) { + char *score_ref = ((char *)dictGetVal(de)); + return (zskiplistNode *)(score_ref - offsetof(zskiplistNode, score)); +} + unsigned long zsetLength(const robj *zobj) { unsigned long length = 0; if (zobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -1603,8 +1662,9 @@ long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { de = dictFind(zs->dict, ele); if (de != NULL) { - score = *(double *)dictGetVal(de); - rank = zslGetRank(zsl, score, ele); + zskiplistNode *n = zsetGetSLNodeByEntry(de); + score = n->score; + rank = zslGetRankByNode(zsl, n); /* Existing elements always have a rank. */ serverAssert(rank != 0); if (output_score) *output_score = score; From 1ba85d002a824a12b0107bdd2b493a3a0516cec9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Dec 2024 00:37:04 +0800 Subject: [PATCH 91/92] Use binary representation in assert crash log, cleanup in crash log (#1410) Change assert crash log to also use binary representation like 5bdd72bea77d4bb237441c9a671e80edcdc998ad. And do not print the password in assert crash log like 56eef6fb5ab7a755485c19f358761954ca459472. In addition, for 5bdd72bea77d4bb237441c9a671e80edcdc998ad, we will print '"argv"', because originally the code would print a '', and sdscatrepr will add an extra "", so now removing the extra '' here. Extract the getArgvReprString method and clean up the code a bit. 
Examples: ``` debug assert "\x00abc" before: client->argv[0] = "debug" (refcount: 1) client->argv[1] = "assert" (refcount: 1) client->argv[2] = "" (refcount: 1) after: client->argv[0] = "debug" (refcount: 1) client->argv[1] = "assert" (refcount: 1) client->argv[2] = "\x00abc" (refcount: 1) debug panic "\x00abc" before: argc: '3' argv[0]: '"debug"' argv[1]: '"panic"' argv[2]: '"\x00abc"' after: argc: 3 argv[0]: "debug" argv[1]: "panic" argv[2]: "\x00abc" ``` Signed-off-by: Binbin --- src/debug.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/debug.c b/src/debug.c index 38b66dacb5..7407af3514 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1049,6 +1049,14 @@ __attribute__((noinline, weak)) void _serverAssert(const char *estr, const char bugReportEnd(0, 0); } +/* Returns the argv argument in binary representation, limited to length 128. */ +sds getArgvReprString(robj *argv) { + robj *decoded = getDecodedObject(argv); + sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128)); + decrRefCount(decoded); + return repr; +} + /* Checks if the argument at the given index should be redacted from logs. 
*/ int shouldRedactArg(const client *c, int idx) { serverAssert(idx < c->argc); @@ -1073,16 +1081,12 @@ void _serverAssertPrintClientInfo(const client *c) { serverLog(LL_WARNING, "client->argv[%d]: %zu bytes", j, sdslen((sds)c->argv[j]->ptr)); continue; } - char buf[128]; - char *arg; - - if (c->argv[j]->type == OBJ_STRING && sdsEncodedObject(c->argv[j])) { - arg = (char *)c->argv[j]->ptr; - } else { - snprintf(buf, sizeof(buf), "Object type: %u, encoding: %u", c->argv[j]->type, c->argv[j]->encoding); - arg = buf; + sds repr = getArgvReprString(c->argv[j]); + serverLog(LL_WARNING, "client->argv[%d] = %s (refcount: %d)", j, repr, c->argv[j]->refcount); + sdsfree(repr); + if (!strcasecmp(c->argv[j]->ptr, "auth") || !strcasecmp(c->argv[j]->ptr, "auth2")) { + break; } - serverLog(LL_WARNING, "client->argv[%d] = \"%s\" (refcount: %d)", j, arg, c->argv[j]->refcount); } } @@ -1890,23 +1894,18 @@ void logCurrentClient(client *cc, const char *title) { client = catClientInfoString(sdsempty(), cc, server.hide_user_data_from_log); serverLog(LL_WARNING | LL_RAW, "%s\n", client); sdsfree(client); - serverLog(LL_WARNING | LL_RAW, "argc: '%d'\n", cc->argc); + serverLog(LL_WARNING | LL_RAW, "argc: %d\n", cc->argc); for (j = 0; j < cc->argc; j++) { if (shouldRedactArg(cc, j)) { serverLog(LL_WARNING | LL_RAW, "argv[%d]: %zu bytes\n", j, sdslen((sds)cc->argv[j]->ptr)); continue; } - robj *decoded; - decoded = getDecodedObject(cc->argv[j]); - sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128)); - serverLog(LL_WARNING | LL_RAW, "argv[%d]: '%s'\n", j, (char *)repr); - if (!strcasecmp(decoded->ptr, "auth") || !strcasecmp(decoded->ptr, "auth2")) { - sdsfree(repr); - decrRefCount(decoded); + sds repr = getArgvReprString(cc->argv[j]); + serverLog(LL_WARNING | LL_RAW, "argv[%d]: %s\n", j, repr); + sdsfree(repr); + if (!strcasecmp(cc->argv[j]->ptr, "auth") || !strcasecmp(cc->argv[j]->ptr, "auth2")) { break; } - sdsfree(repr); - decrRefCount(decoded); } /* Check if 
the first argument, usually a key, is found inside the * selected DB, and if so print info about the associated object. */ From 4f61034934cf165163ef272e5795bccadc288b09 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 9 Dec 2024 12:28:17 -0800 Subject: [PATCH 92/92] Update governance and maintainers file for Valkey committers (#1390) We added two more committers, but according to our governance document that makes them TSC members. As we discussed, for now we want to keep the balance of corporate interests, so updating the governance to explicitly list TSC members compared to folks with just write permissions. Also adds the new folks with commit permissions. --------- Signed-off-by: Madelyn Olson --- GOVERNANCE.md | 4 +++- MAINTAINERS.md | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/GOVERNANCE.md b/GOVERNANCE.md index 33c3887430..7fd33272cb 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -2,7 +2,9 @@ The Valkey project is managed by a Technical Steering Committee (TSC) composed of the maintainers of the Valkey repository. The Valkey project includes all of the current and future repositories under the Valkey-io organization. -Maintainers are defined as individuals with full commit access to a repository, which shall be in sync with the MAINTAINERS.md file in a given projects repository. +Committers are defined as individuals with write access to the code within a repository. +Maintainers are defined as individuals with full access to a repository and own its governance. +Both maintainers and committers should be clearly listed in the MAINTAINERS.md file in a given projects repository. Maintainers of other repositories within the Valkey project are not members of the TSC unless explicitly added. 
## Technical Steering Committee diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 635bf25067..947979eb33 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -16,8 +16,16 @@ Maintainers listed in alphabetical order by their github ID. | Zhao Zhao | [soloestoy](https://github.com/soloestoy) | Alibaba | | Viktor Söderqvist | [zuiderkwast](https://github.com/zuiderkwast) | Ericsson | +## Current Committers -### Former Maintainers +Committers listed in alphabetical order by their github ID. + +| Committer | GitHub ID | Affiliation | +| ------------------- | ----------------------------------------------- | ----------- | +| Harkrishn Patro | [hpatro](https://github.com/hpatro) | Amazon | +| Ran Shidlansik | [ranshid](https://github.com/ranshid) | Amazon | + +### Former Maintainers and Committers | Maintainer | GitHub ID | Affiliation | | ------------------- | ----------------------------------------------- | ----------- | \ No newline at end of file