From c387e1fd3cf863134a1973139c9a5b803e1f4e22 Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Mon, 8 Jul 2024 07:05:46 +0000 Subject: [PATCH] Async IO threads Signed-off-by: Uri Yagelnik --- .github/workflows/daily.yml | 4 +- src/Makefile | 2 +- src/ae.c | 4 +- src/ae.h | 5 +- src/blocked.c | 2 +- src/config.c | 6 +- src/config.h | 11 +- src/connection.h | 18 + src/debug.c | 2 + src/eval.c | 2 +- src/io_threads.c | 377 ++++++ src/io_threads.h | 13 + src/networking.c | 1569 ++++++++++++------------ src/rdb.c | 2 +- src/replication.c | 22 +- src/server.c | 112 +- src/server.h | 107 +- src/socket.c | 2 + src/tls.c | 55 +- src/unix.c | 2 + tests/integration/failover.tcl | 6 + tests/integration/replication.tcl | 1 + tests/integration/shutdown.tcl | 12 + tests/integration/valkey-cli.tcl | 1 + tests/unit/client-eviction.tcl | 23 +- tests/unit/cluster/pubsubshard.tcl | 8 +- tests/unit/dump.tcl | 1 + tests/unit/info.tcl | 83 +- tests/unit/maxmemory.tcl | 1 + tests/unit/memefficiency.tcl | 3 + tests/unit/moduleapi/blockedclient.tcl | 2 +- tests/unit/pubsub.tcl | 12 + tests/unit/pubsubshard.tcl | 8 + tests/unit/querybuf.tcl | 2 +- tests/unit/type/list.tcl | 7 + tests/unit/type/stream-cgroups.tcl | 11 +- tests/unit/type/zset.tcl | 1 + valkey.conf | 21 +- 38 files changed, 1551 insertions(+), 969 deletions(-) create mode 100644 src/io_threads.c create mode 100644 src/io_threads.h diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 7679856d1d..91dbb26fab 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -358,10 +358,10 @@ jobs: run: sudo apt-get install tcl8.6 tclx - name: test if: true && !contains(github.event.inputs.skiptests, 'valkey') - run: ./runtest --config io-threads 4 --config io-threads-do-reads yes --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} + run: ./runtest --config io-threads 2 --config events-per-io-thread 0 --accurate --verbose --tags network --dump-logs 
${{github.event.inputs.test_args}} - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster --config io-threads 4 --config io-threads-do-reads yes ${{github.event.inputs.cluster_test_args}} + run: ./runtest-cluster --config io-threads 2 --config events-per-io-thread 0 ${{github.event.inputs.cluster_test_args}} test-ubuntu-reclaim-cache: runs-on: ubuntu-latest diff --git a/src/Makefile b/src/Makefile index 18e5527eff..4e8c34b253 100644 --- a/src/Makefile +++ b/src/Makefile @@ -401,7 +401,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o 
multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/ae.c b/src/ae.c index 62031cbeea..28b50c660f 100644 --- a/src/ae.c +++ b/src/ae.c @@ -392,7 +392,7 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) { } /* After sleep callback. 
*/ - if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) eventLoop->aftersleep(eventLoop); + if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) eventLoop->aftersleep(eventLoop, numevents); for (j = 0; j < numevents; j++) { int fd = eventLoop->fired[j].fd; @@ -489,6 +489,6 @@ void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep eventLoop->beforesleep = beforesleep; } -void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep) { +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeAfterSleepProc *aftersleep) { eventLoop->aftersleep = aftersleep; } diff --git a/src/ae.h b/src/ae.h index a6dcbce50d..3b1c96a01d 100644 --- a/src/ae.h +++ b/src/ae.h @@ -68,6 +68,7 @@ typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); +typedef void aeAfterSleepProc(struct aeEventLoop *eventLoop, int numevents); /* File event structure */ typedef struct aeFileEvent { @@ -107,7 +108,7 @@ typedef struct aeEventLoop { int stop; void *apidata; /* This is used for polling API specific data */ aeBeforeSleepProc *beforesleep; - aeBeforeSleepProc *aftersleep; + aeAfterSleepProc *aftersleep; int flags; } aeEventLoop; @@ -130,7 +131,7 @@ int aeWait(int fd, int mask, long long milliseconds); void aeMain(aeEventLoop *eventLoop); char *aeGetApiName(void); void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); -void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep); +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeAfterSleepProc *aftersleep); int aeGetSetSize(aeEventLoop *eventLoop); int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); void aeSetDontWait(aeEventLoop *eventLoop, int noWait); diff --git 
a/src/blocked.c b/src/blocked.c index 15ef39af3b..a1d5306dad 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -146,7 +146,7 @@ void processUnblockedClients(void) { if (!c->flag.blocked) { /* If we have a queued command, execute it now. */ if (processPendingCommandAndInputBuffer(c) == C_ERR) { - c = NULL; + continue; } } beforeNextClient(c); diff --git a/src/config.c b/src/config.c index f8784413f9..32e6018ff2 100644 --- a/src/config.c +++ b/src/config.c @@ -590,6 +590,9 @@ void loadServerConfigFromString(char *config) { if (server.config_hz < CONFIG_MIN_HZ) server.config_hz = CONFIG_MIN_HZ; if (server.config_hz > CONFIG_MAX_HZ) server.config_hz = CONFIG_MAX_HZ; + /* To ensure backward compatibility when io_threads_num is according to the previous maximum of 128. */ + if (server.io_threads_num > IO_THREADS_MAX_NUM) server.io_threads_num = IO_THREADS_MAX_NUM; + sdsfreesplitres(lines, totlines); reading_config_file = 0; return; @@ -3023,7 +3026,7 @@ standardConfig static_configs[] = { /* Bool configs */ createBoolConfig("rdbchecksum", NULL, IMMUTABLE_CONFIG, server.rdb_checksum, 1, NULL, NULL), createBoolConfig("daemonize", NULL, IMMUTABLE_CONFIG, server.daemonize, 0, NULL, NULL), - createBoolConfig("io-threads-do-reads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, server.io_threads_do_reads, 0, NULL, NULL), /* Read + parse from threads? 
*/ + createBoolConfig("io-threads-do-reads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, server.io_threads_do_reads, 1, NULL, NULL), /* Read + parse from threads */ createBoolConfig("always-show-logo", NULL, IMMUTABLE_CONFIG, server.always_show_logo, 0, NULL, NULL), createBoolConfig("protected-mode", NULL, MODIFIABLE_CONFIG, server.protected_mode, 1, NULL, NULL), createBoolConfig("rdbcompression", NULL, MODIFIABLE_CONFIG, server.rdb_compression, 1, NULL, NULL), @@ -3124,6 +3127,7 @@ standardConfig static_configs[] = { createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL), createIntConfig("port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.port, 6379, INTEGER_CONFIG, NULL, updatePort), /* TCP port. */ createIntConfig("io-threads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, 1, 128, server.io_threads_num, 1, INTEGER_CONFIG, NULL, NULL), /* Single threaded by default */ + createIntConfig("events-per-io-thread", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.events_per_io_thread, 2, INTEGER_CONFIG, NULL, NULL), createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_replica_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* replica max data age factor. 
*/ createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), diff --git a/src/config.h b/src/config.h index 95c2e84a00..201e421976 100644 --- a/src/config.h +++ b/src/config.h @@ -264,6 +264,15 @@ void setproctitle(const char *fmt, ...); #error "Undefined or invalid BYTE_ORDER" #endif +/* Cache line alignment */ +#ifndef CACHE_LINE_SIZE +#if defined(__aarch64__) && defined(__APPLE__) +#define CACHE_LINE_SIZE 128 +#else +#define CACHE_LINE_SIZE 64 +#endif /* __aarch64__ && __APPLE__ */ +#endif /* CACHE_LINE_SIZE */ + #if (__i386 || __amd64 || __powerpc__) && __GNUC__ #define GNUC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #if defined(__clang__) @@ -329,7 +338,7 @@ void setcpuaffinity(const char *cpulist); #define HAVE_FADVISE #endif -#define IO_THREADS_MAX_NUM 128 +#define IO_THREADS_MAX_NUM 16 #ifndef CACHE_LINE_SIZE #if defined(__aarch64__) && defined(__APPLE__) diff --git a/src/connection.h b/src/connection.h index 3de581b417..c6466c2d4c 100644 --- a/src/connection.h +++ b/src/connection.h @@ -112,6 +112,12 @@ typedef struct ConnectionType { int (*has_pending_data)(void); int (*process_pending_data)(void); + /* Postpone update state - with IO threads & TLS we don't want the IO threads to update the event loop events - let + * the main-thread do it */ + void (*postpone_update_state)(struct connection *conn, int); + /* Called by the main-thread */ + void (*update_state)(struct connection *conn); + /* TLS specified methods */ sds (*get_peer_cert)(struct connection *conn); } ConnectionType; @@ -456,4 +462,16 @@ static inline int connIsTLS(connection *conn) { return conn && conn->type == connectionTypeTls(); } +static inline void connUpdateState(connection *conn) { + if (conn->type->update_state) { + conn->type->update_state(conn); + } +} + +static inline void connSetPostponeUpdateState(connection *conn, int on) { + if 
(conn->type->postpone_update_state) { + conn->type->postpone_update_state(conn, on); + } +} + #endif /* __REDIS_CONNECTION_H */ diff --git a/src/debug.c b/src/debug.c index 36c425a4f4..9501b8a658 100644 --- a/src/debug.c +++ b/src/debug.c @@ -37,6 +37,7 @@ #include "fpconv_dtoa.h" #include "cluster.h" #include "threads_mngr.h" +#include "io_threads.h" #include #include @@ -2159,6 +2160,7 @@ void removeSigSegvHandlers(void) { } void printCrashReport(void) { + server.crashed = 1; /* Log INFO and CLIENT LIST */ logServerInfo(); diff --git a/src/eval.c b/src/eval.c index e4e51f7da5..2afbf445f5 100644 --- a/src/eval.c +++ b/src/eval.c @@ -928,7 +928,7 @@ void ldbEndSession(client *c) { /* If it's a fork()ed session, we just exit. */ if (ldb.forked) { - writeToClient(c, 0); + writeToClient(c); serverLog(LL_NOTICE, "Lua debugging session child exiting"); exitFromChild(0); } else { diff --git a/src/io_threads.c b/src/io_threads.c new file mode 100644 index 0000000000..6149febabc --- /dev/null +++ b/src/io_threads.c @@ -0,0 +1,377 @@ +#include "io_threads.h" + +static __thread int thread_id = 0; /* Thread local var */ +static pthread_t io_threads[IO_THREADS_MAX_NUM] = {0}; +static pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; + +/* IO jobs queue functions - Used to send jobs from the main-thread to the IO thread. */ +typedef void (*job_handler)(void *); +typedef struct iojob { + job_handler handler; + void *data; +} iojob; + +typedef struct IOJobQueue { + iojob *ring_buffer; + size_t size; + _Atomic size_t head __attribute__((aligned(CACHE_LINE_SIZE))); /* Next write index for producer (main-thread) */ + _Atomic size_t tail __attribute__((aligned(CACHE_LINE_SIZE))); /* Next read index for consumer (IO-thread) */ +} IOJobQueue; +IOJobQueue io_jobs[IO_THREADS_MAX_NUM] = {0}; + +/* Initialize the job queue with a specified number of items. 
*/ +static void IOJobQueue_init(IOJobQueue *jq, size_t item_count) { + debugServerAssertWithInfo(NULL, NULL, inMainThread()); + jq->ring_buffer = zcalloc(item_count * sizeof(iojob)); + jq->size = item_count; /* Total number of items */ + jq->head = 0; + jq->tail = 0; +} + +/* Clean up the job queue and free allocated memory. */ +static void IOJobQueue_cleanup(IOJobQueue *jq) { + debugServerAssertWithInfo(NULL, NULL, inMainThread()); + zfree(jq->ring_buffer); + memset(jq, 0, sizeof(*jq)); +} + +static int IOJobQueue_isFull(const IOJobQueue *jq) { + debugServerAssertWithInfo(NULL, NULL, inMainThread()); + size_t current_head = atomic_load_explicit(&jq->head, memory_order_relaxed); + /* We don't use memory_order_acquire for the tail due to performance reasons, + * In the worst case we will just assume wrongly the buffer is full and the main thread will do the job by itself. */ + size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); + size_t next_head = (current_head + 1) % jq->size; + return next_head == current_tail; +} + +/* Attempt to push a new job to the queue from the main thread. + * the caller must ensure the queue is not full before calling this function. */ +static void IOJobQueue_push(IOJobQueue *jq, job_handler handler, void *data) { + debugServerAssertWithInfo(NULL, NULL, inMainThread()); + /* Assert the queue is not full - should not happen as the caller should check for it before. */ + serverAssert(!IOJobQueue_isFull(jq)); + + /* No need to use atomic acquire for the head, as the main thread is the only one that writes to the head index. */ + size_t current_head = atomic_load_explicit(&jq->head, memory_order_relaxed); + size_t next_head = (current_head + 1) % jq->size; + + /* We store directly the job's fields to avoid allocating a new iojob structure. 
*/ + serverAssert(jq->ring_buffer[current_head].data == NULL); + serverAssert(jq->ring_buffer[current_head].handler == NULL); + jq->ring_buffer[current_head].data = data; + jq->ring_buffer[current_head].handler = handler; + + /* memory_order_release to make sure the data is visible to the consumer (the IO thread). */ + atomic_store_explicit(&jq->head, next_head, memory_order_release); +} + +/* Returns the number of jobs currently available for consumption in the given job queue. + * + * This function ensures memory visibility for the jobs by + * using a memory acquire fence when there are jobs available. */ +static size_t IOJobQueue_availableJobs(const IOJobQueue *jq) { + debugServerAssertWithInfo(NULL, NULL, !inMainThread()); + /* We use memory_order_acquire to make sure the head and the job's fields are visible to the consumer (IO thread). */ + size_t current_head = atomic_load_explicit(&jq->head, memory_order_acquire); + size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); + + if (current_head >= current_tail) { + return current_head - current_tail; + } else { + return jq->size - (current_tail - current_head); + } +} + +/* Checks if the job Queue is empty. + * returns 1 if the buffer is currently empty, 0 otherwise. + * Called by the main-thread only. + * This function uses relaxed memory order, so the caller needs to use an acquire + * memory fence before calling this function to be sure it has the latest index + * from the other thread, especially when called repeatedly. */ +static int IOJobQueue_isEmpty(const IOJobQueue *jq) { + size_t current_head = atomic_load_explicit(&jq->head, memory_order_relaxed); + size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); + return current_head == current_tail; +} + +/* Removes the next job from the given job queue by advancing the tail index. + * Called by the IO thread. + * The caller must ensure that the queue is not empty before calling this function. 
+ * This function uses relaxed memory order, so the caller needs to use a release memory fence + * after calling this function to make sure the updated tail is visible to the producer (main thread). */ +static void IOJobQueue_removeJob(IOJobQueue *jq) { + debugServerAssertWithInfo(NULL, NULL, !inMainThread()); + size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); + jq->ring_buffer[current_tail].data = NULL; + jq->ring_buffer[current_tail].handler = NULL; + atomic_store_explicit(&jq->tail, (current_tail + 1) % jq->size, memory_order_relaxed); +} + +/* Retrieves the next job handler and data from the job queue without removal. + * Called by the consumer (IO thread). Caller must ensure queue is not empty.*/ +static void IOJobQueue_peek(const IOJobQueue *jq, job_handler *handler, void **data) { + debugServerAssertWithInfo(NULL, NULL, !inMainThread()); + size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); + iojob *job = &jq->ring_buffer[current_tail]; + *handler = job->handler; + *data = job->data; +} + +/* End of IO job queue functions */ + +int inMainThread(void) { + return thread_id == 0; +} + +/* Wait until the IO-thread is done with the client */ +void waitForClientIO(client *c) { + /* No need to wait if the client was not offloaded to the IO thread. */ + if (c->io_read_state == CLIENT_IDLE && c->io_write_state == CLIENT_IDLE) return; + + /* Wait for read operation to complete if pending. */ + while (c->io_read_state == CLIENT_PENDING_IO) { + atomic_thread_fence(memory_order_acquire); + } + + /* Wait for write operation to complete if pending. */ + while (c->io_write_state == CLIENT_PENDING_IO) { + atomic_thread_fence(memory_order_acquire); + } + + /* Final memory barrier to ensure all changes are visible */ + atomic_thread_fence(memory_order_acquire); +} + +/** Adjusts the number of active I/O threads based on the current event load. 
+ * If increase_only is non-zero, only allows increasing the number of threads.*/ +void adjustIOThreadsByEventLoad(int numevents, int increase_only) { + if (server.io_threads_num == 1) return; /* All I/O is being done by the main thread. */ + debugServerAssertWithInfo(NULL, NULL, server.io_threads_num > 1); + + int target_threads = + server.events_per_io_thread == 0 ? server.io_threads_num : numevents / server.events_per_io_thread; + + target_threads = max(1, min(target_threads, server.io_threads_num)); + + if (target_threads == server.active_io_threads_num) return; + + if (target_threads < server.active_io_threads_num) { + if (increase_only) return; + + int threads_to_deactivate_num = server.active_io_threads_num - target_threads; + for (int i = 0; i < threads_to_deactivate_num; i++) { + int tid = server.active_io_threads_num - 1; + IOJobQueue *jq = &io_jobs[tid]; + /* We can't lock the thread if it may have pending jobs */ + if (!IOJobQueue_isEmpty(jq)) return; + pthread_mutex_lock(&io_threads_mutex[tid]); + server.active_io_threads_num--; + } + } else { + int threads_to_activate_num = target_threads - server.active_io_threads_num; + for (int i = 0; i < threads_to_activate_num; i++) { + pthread_mutex_unlock(&io_threads_mutex[server.active_io_threads_num]); + server.active_io_threads_num++; + } + } +} + +static void *IOThreadMain(void *myid) { + /* The ID is the thread ID number (from 1 to server.io_threads_num-1). ID 0 is the main thread. 
*/ + long id = (long)myid; + char thdname[32]; + + serverAssert(server.io_threads_num > 0); + serverAssert(id > 0 && id < server.io_threads_num); + snprintf(thdname, sizeof(thdname), "io_thd_%ld", id); + valkey_set_thread_title(thdname); + serverSetCpuAffinity(server.server_cpulist); + makeThreadKillable(); + initSharedQueryBuf(); + + thread_id = (int)id; + size_t jobs_to_process = 0; + IOJobQueue *jq = &io_jobs[id]; + while (1) { + /* Wait for jobs */ + for (int j = 0; j < 1000000; j++) { + jobs_to_process = IOJobQueue_availableJobs(jq); + if (jobs_to_process) break; + } + + /* Give the main thread a chance to stop this thread. */ + if (jobs_to_process == 0) { + pthread_mutex_lock(&io_threads_mutex[id]); + pthread_mutex_unlock(&io_threads_mutex[id]); + continue; + } + + for (size_t j = 0; j < jobs_to_process; j++) { + job_handler handler; + void *data; + /* We keep the job in the queue until it's processed. This ensures that if the main thread checks + * and finds the queue empty, it can be certain that the IO thread is not currently handling any job. */ + IOJobQueue_peek(jq, &handler, &data); + handler(data); + /* Remove the job after it was processed */ + IOJobQueue_removeJob(jq); + } + /* Memory barrier to make sure the main thread sees the updated tail index. + * We do it once per loop and not per tail-update for optimization reasons. + * As the main-thread main concern is to check if the queue is empty, it's enough to do it once at the end. */ + atomic_thread_fence(memory_order_release); + } + freeSharedQueryBuf(); + return NULL; +} + +#define IO_JOB_QUEUE_SIZE 2048 +static void createIOThread(int id) { + pthread_t tid; + pthread_mutex_init(&io_threads_mutex[id], NULL); + IOJobQueue_init(&io_jobs[id], IO_JOB_QUEUE_SIZE); + pthread_mutex_lock(&io_threads_mutex[id]); /* Thread will be stopped. 
*/ + if (pthread_create(&tid, NULL, IOThreadMain, (void *)(long)id) != 0) { + serverLog(LL_WARNING, "Fatal: Can't initialize IO thread, pthread_create failed with: %s", strerror(errno)); + exit(1); + } + io_threads[id] = tid; +} + +/* Terminates the IO thread specified by id. + * Called on server shutdown */ +static void shutdownIOThread(int id) { + int err; + pthread_t tid = io_threads[id]; + if (tid == pthread_self()) return; + if (tid == 0) return; + + pthread_cancel(tid); + + if ((err = pthread_join(tid, NULL)) != 0) { + serverLog(LL_WARNING, "IO thread(tid:%lu) can not be joined: %s", (unsigned long)tid, strerror(err)); + } else { + serverLog(LL_NOTICE, "IO thread(tid:%lu) terminated", (unsigned long)tid); + } + + IOJobQueue_cleanup(&io_jobs[id]); +} + +void killIOThreads(void) { + for (int j = 1; j < server.io_threads_num; j++) { /* We don't kill thread 0, which is the main thread. */ + shutdownIOThread(j); + } +} + +/* Initialize the data structures needed for I/O threads. */ +void initIOThreads(void) { + server.active_io_threads_num = 1; /* We start with threads not active. */ + + /* Don't spawn any thread if the user selected a single thread: + * we'll handle I/O directly from the main thread. */ + if (server.io_threads_num == 1) return; + + serverAssert(server.io_threads_num <= IO_THREADS_MAX_NUM); + + /* Spawn and initialize the I/O threads. */ + for (int i = 1; i < server.io_threads_num; i++) { + createIOThread(i); + } +} + +int trySendReadToIOThreads(client *c) { + if (server.active_io_threads_num <= 1) return C_ERR; + if (!server.io_threads_do_reads) return C_ERR; + /* If IO thread is already reading, return C_OK to make sure the main thread will not handle it. */ + if (c->io_read_state != CLIENT_IDLE) return C_OK; + /* Currently, replica/master reads are not offloaded and are processed synchronously. 
*/ + if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* With Lua debug client we may call connWrite directly in the main thread */ + if (c->flag.lua_debug) return C_ERR; + /* For simplicity let the main-thread handle the blocked clients */ + if (c->flag.blocked || c->flag.unblocked) return C_ERR; + if (c->flag.close_asap) return C_ERR; + size_t tid = (c->id % (server.active_io_threads_num - 1)) + 1; + + /* Handle case where client has a pending IO write job on a different thread: + * 1. A write job is still pending (io_write_state == CLIENT_PENDING_IO) + * 2. The pending job is on a different thread (c->cur_tid != tid) + * + * This situation can occur if active_io_threads_num increased since the + * original job assignment. In this case, we keep the job on its current + * thread to ensure the same thread handles the client's I/O operations. */ + if (c->io_write_state == CLIENT_PENDING_IO && c->cur_tid != (uint8_t)tid) tid = c->cur_tid; + + IOJobQueue *jq = &io_jobs[tid]; + if (IOJobQueue_isFull(jq)) return C_ERR; + + c->cur_tid = tid; + c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; + c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; + + c->io_read_state = CLIENT_PENDING_IO; + connSetPostponeUpdateState(c->conn, 1); + IOJobQueue_push(jq, ioThreadReadQueryFromClient, c); + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + return C_OK; +} + +/* This function attempts to offload the client's write to an I/O thread. + * Returns C_OK if the client's writes were successfully offloaded to an I/O thread, + * or C_ERR if the client is not eligible for offloading. */ +int trySendWriteToIOThreads(client *c) { + if (server.active_io_threads_num <= 1) return C_ERR; + /* The I/O thread is already writing for this client. 
*/ + if (c->io_write_state != CLIENT_IDLE) return C_OK; + /* Nothing to write */ + if (!clientHasPendingReplies(c)) return C_ERR; + /* Currently, replica/master writes are not offloaded and are processed synchronously. */ + if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* We can't offload debugged clients as the main-thread may read at the same time */ + if (c->flag.lua_debug) return C_ERR; + + size_t tid = (c->id % (server.active_io_threads_num - 1)) + 1; + /* Handle case where client has a pending IO read job on a different thread: + * 1. A read job is still pending (io_read_state == CLIENT_PENDING_IO) + * 2. The pending job is on a different thread (c->cur_tid != tid) + * + * This situation can occur if active_io_threads_num increased since the + * original job assignment. In this case, we keep the job on its current + * thread to ensure the same thread handles the client's I/O operations. */ + if (c->io_read_state == CLIENT_PENDING_IO && c->cur_tid != (uint8_t)tid) tid = c->cur_tid; + + IOJobQueue *jq = &io_jobs[tid]; + if (IOJobQueue_isFull(jq)) return C_ERR; + + c->cur_tid = tid; + if (c->flag.pending_write) { + /* We move the client to the io pending write queue */ + listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); + } else { + c->flag.pending_write = 1; + } + serverAssert(c->clients_pending_write_node.prev == NULL && c->clients_pending_write_node.next == NULL); + listLinkNodeTail(server.clients_pending_io_write, &c->clients_pending_write_node); + + /* Save the last block of the reply list to io_last_reply_block and the used + * position to io_last_bufpos. The I/O thread will write only up to + * io_last_bufpos, regardless of the c->bufpos value. This is to prevent I/O + * threads from reading data that might be invalid in their local CPU cache. 
*/ + c->io_last_reply_block = listLast(c->reply); + if (c->io_last_reply_block) { + c->io_last_bufpos = ((clientReplyBlock *)listNodeValue(c->io_last_reply_block))->used; + } else { + c->io_last_bufpos = (size_t)c->bufpos; + } + serverAssert(c->bufpos > 0 || c->io_last_bufpos > 0); + + /* The main-thread will update the client state after the I/O thread completes the write. */ + connSetPostponeUpdateState(c->conn, 1); + c->write_flags = 0; + c->io_write_state = CLIENT_PENDING_IO; + + IOJobQueue_push(jq, ioThreadWriteToClient, c); + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h new file mode 100644 index 0000000000..30d1cdad79 --- /dev/null +++ b/src/io_threads.h @@ -0,0 +1,13 @@ +#ifndef IO_THREADS_H +#define IO_THREADS_H + +#include "server.h" + +void initIOThreads(void); +void killIOThreads(void); +int inMainThread(void); +int trySendReadToIOThreads(client *c); +int trySendWriteToIOThreads(client *c); +void adjustIOThreadsByEventLoad(int numevents, int increase_only); + +#endif /* IO_THREADS_H */ diff --git a/src/networking.c b/src/networking.c index f017e7c034..b249aa61f3 100644 --- a/src/networking.c +++ b/src/networking.c @@ -33,6 +33,7 @@ #include "fpconv_dtoa.h" #include "fmtargs.h" #include +#include "io_threads.h" #include #include #include @@ -46,6 +47,8 @@ char *getClientSockname(client *c); int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ __thread sds thread_shared_qb = NULL; +typedef enum { PARSE_OK = 0, PARSE_ERR = -1, PARSE_NEEDMORE = -2 } parseResult; + /* Return the size consumed from the allocator, for the specified SDS string, * including internal fragmentation. This function is used in order to compute * the client output buffer size. 
*/ @@ -158,6 +161,9 @@ client *createClient(connection *conn) { c->argv_len_sum = 0; c->original_argc = 0; c->original_argv = NULL; + c->nread = 0; + c->read_flags = 0; + c->write_flags = 0; c->cmd = c->lastcmd = c->realcmd = NULL; c->cur_script = NULL; c->multibulklen = 0; @@ -199,7 +205,9 @@ client *createClient(connection *conn) { c->sockname = NULL; c->client_list_node = NULL; c->postponed_list_node = NULL; - c->pending_read_list_node = NULL; + c->io_read_state = CLIENT_IDLE; + c->io_write_state = CLIENT_IDLE; + c->nwritten = 0; c->client_tracking_redirection = 0; c->client_tracking_prefixes = NULL; c->last_memory_usage = 0; @@ -210,6 +218,7 @@ client *createClient(connection *conn) { c->auth_callback_privdata = NULL; c->auth_module = NULL; listInitNode(&c->clients_pending_write_node, c); + listInitNode(&c->pending_read_list_node, c); c->mem_usage_bucket = NULL; c->mem_usage_bucket_node = NULL; if (conn) linkClient(c); @@ -300,13 +309,8 @@ int prepareClientToWrite(client *c) { if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ /* Schedule the client to write the output buffers to the socket, unless - * it should already be setup to do so (it has already pending data). - * - * If CLIENT_PENDING_READ is set, we're in an IO thread and should - * not put the client in pending write queue. Instead, it will be - * done by handleClientsWithPendingReadsUsingThreads() upon return. - */ - if (!clientHasPendingReplies(c) && io_threads_op == IO_THREADS_OP_IDLE) putClientInPendingWriteQueue(c); + * it should already be setup to do so (it has already pending data). */ + if (!clientHasPendingReplies(c)) putClientInPendingWriteQueue(c); /* Authorize the caller to queue in the output buffer of this client. */ return C_OK; @@ -745,7 +749,8 @@ void trimReplyUnusedTailSpace(client *c) { * allocation), otherwise there's a high chance realloc will NOP. * Also, to avoid large memmove which happens as part of realloc, we only do * that if the used part is small. 
*/ - if (tail->size - tail->used > tail->size / 4 && tail->used < PROTO_REPLY_CHUNK_BYTES) { + if (tail->size - tail->used > tail->size / 4 && tail->used < PROTO_REPLY_CHUNK_BYTES && + c->io_write_state != CLIENT_PENDING_IO) { size_t usable_size; size_t old_size = tail->size; tail = zrealloc_usable(tail, tail->used + sizeof(clientReplyBlock), &usable_size); @@ -804,8 +809,10 @@ void setDeferredReply(client *c, void *node, const char *s, size_t length) { * - The prev node is non-NULL and has space in it or * - The next node is non-NULL, * - It has enough room already allocated - * - And not too large (avoid large memmove) */ - if (ln->prev != NULL && (prev = listNodeValue(ln->prev)) && prev->size - prev->used > 0) { + * - And not too large (avoid large memmove) + * - And the client is not in a pending I/O state */ + if (ln->prev != NULL && (prev = listNodeValue(ln->prev)) && prev->size - prev->used > 0 && + c->io_write_state != CLIENT_PENDING_IO) { size_t len_to_copy = prev->size - prev->used; if (len_to_copy > length) len_to_copy = length; memcpy(prev->buf + prev->used, s, len_to_copy); @@ -819,7 +826,7 @@ void setDeferredReply(client *c, void *node, const char *s, size_t length) { } if (ln->next != NULL && (next = listNodeValue(ln->next)) && next->size - next->used >= length && - next->used < PROTO_REPLY_CHUNK_BYTES * 4) { + next->used < PROTO_REPLY_CHUNK_BYTES * 4 && c->io_write_state != CLIENT_PENDING_IO) { memmove(next->buf + length, next->buf, next->used); memcpy(next->buf, s, length); next->used += length; @@ -1498,15 +1505,19 @@ void unlinkClient(client *c) { /* Remove from the list of pending writes if needed. 
*/ if (c->flag.pending_write) { serverAssert(&c->clients_pending_write_node.next != NULL || &c->clients_pending_write_node.prev != NULL); - listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); + if (c->io_write_state == CLIENT_IDLE) { + listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); + } else { + listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node); + } c->flag.pending_write = 0; } /* Remove from the list of pending reads if needed. */ - serverAssert(!c->conn || io_threads_op == IO_THREADS_OP_IDLE); - if (c->pending_read_list_node != NULL) { - listDelNode(server.clients_pending_read, c->pending_read_list_node); - c->pending_read_list_node = NULL; + serverAssert(c->io_read_state != CLIENT_PENDING_IO && c->io_write_state != CLIENT_PENDING_IO); + if (c->flag.pending_read) { + listUnlinkNode(server.clients_pending_io_read, &c->pending_read_list_node); + c->flag.pending_read = 0; } @@ -1585,6 +1596,9 @@ void freeClient(client *c) { return; } + /* Wait for IO operations to be done before proceeding */ + waitForClientIO(c); + /* For connected clients, call the disconnection event of modules hooks. */ if (c->conn) { moduleFireServerEvent(VALKEYMODULE_EVENT_CLIENT_CHANGE, VALKEYMODULE_SUBEVENT_CLIENT_CHANGE_DISCONNECTED, c); @@ -1735,22 +1749,9 @@ void freeClient(client *c) { * a context where calling freeClient() is not possible, because the client * should be valid for the continuation of the flow of the program. */ void freeClientAsync(client *c) { - /* We need to handle concurrent access to the server.clients_to_close list - * only in the freeClientAsync() function, since it's the only function that - * may access the list while the server uses I/O threads. All the other accesses - * are in the context of the main thread while the other threads are - * idle. 
*/ if (c->flag.close_asap || c->flag.script) return; c->flag.close_asap = 1; - if (server.io_threads_num == 1) { - /* no need to bother with locking if there's just one thread (the main thread) */ - listAddNodeTail(server.clients_to_close, c); - return; - } - static pthread_mutex_t async_free_queue_mutex = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&async_free_queue_mutex); listAddNodeTail(server.clients_to_close, c); - pthread_mutex_unlock(&async_free_queue_mutex); } /* Log errors for invalid use and free the client in async way. @@ -1769,31 +1770,90 @@ void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...) { freeClientAsync(c); } -/* Perform processing of the client before moving on to processing the next client - * this is useful for performing operations that affect the global state but can't - * wait until we're done with all clients. In other words can't wait until beforeSleep() - * return C_ERR in case client is no longer valid after call. - * The input client argument: c, may be NULL in case the previous client was - * freed before the call. */ -int beforeNextClient(client *c) { +/* Resets the shared query buffer used by the given client. + * If any data remained in the buffer, the client will take ownership of the buffer + * and a new empty buffer will be allocated for the shared buffer. */ +void resetSharedQueryBuf(client *c) { + serverAssert(c->querybuf == thread_shared_qb); + size_t remaining = sdslen(c->querybuf) - c->qb_pos; + + if (remaining > 0) { + /* Let the client take ownership of the shared buffer. */ + initSharedQueryBuf(); + return; + } + + c->querybuf = NULL; + sdsclear(thread_shared_qb); + c->qb_pos = 0; +} + +/* Trims the client query buffer to the current position. 
*/ +void trimClientQueryBuffer(client *c) { + if (c->querybuf == thread_shared_qb) { + resetSharedQueryBuf(c); + } + + if (c->querybuf == NULL) { + return; + } + + serverAssert(c->qb_pos <= sdslen(c->querybuf)); + + if (c->qb_pos > 0) { + sdsrange(c->querybuf, c->qb_pos, -1); + c->qb_pos = 0; + } +} + +/* Perform processing of the client before moving on to processing the next client. + * This is useful for performing operations that affect the global state but can't + * wait until we're done with all clients. In other words, it can't wait until beforeSleep(). + * With IO threads enabled, this function offloads the write to the IO threads if possible. */ +void beforeNextClient(client *c) { /* Notice, this code is also called from 'processUnblockedClients'. * But in case of a module blocked client (see RM_Call 'K' flag) we do not reach this code path. * So whenever we change the code here we need to consider if we need this change on module * blocked client as well */ - /* Skip the client processing if we're in an IO thread, in that case we'll perform - this operation later (this function is called again) in the fan-in stage of the threading mechanism */ - if (io_threads_op != IO_THREADS_OP_IDLE) return C_OK; + /* Trim the query buffer to the current position. */ + if (c->flag.primary) { + /* If the client is a primary, trim the querybuf to repl_applied, + * since primary client is very special, its querybuf not only + * used to parse command, but also proxy to sub-replicas. + * + * Here are some scenarios we cannot trim to qb_pos: + * 1. we don't receive complete command from primary + * 2. primary client blocked cause of client pause + * 3. io threads operate read, primary client flagged with CLIENT_PENDING_COMMAND + * + * In these scenarios, qb_pos points to the part of the current command + * or the beginning of next command, and the current command is not applied yet, + * so the repl_applied is not equal to qb_pos. 
*/ + if (c->repl_applied) { + sdsrange(c->querybuf, c->repl_applied, -1); + c->qb_pos -= c->repl_applied; + c->repl_applied = 0; + } + } else { + trimClientQueryBuffer(c); + } /* Handle async frees */ /* Note: this doesn't make the server.clients_to_close list redundant because of * cases where we want an async free of a client other than myself. For example * in ACL modifications we disconnect clients authenticated to non-existent * users (see ACL LOAD). */ - if (c && (c->flag.close_asap)) { + if (c->flag.close_asap) { freeClient(c); - return C_ERR; + return; + } + + updateClientMemUsageAndBucket(c); + /* If IO threads are enabled try to write immediately the reply instead of waiting to beforeSleep, + * unless aof_fsync is set to always in which case we need to wait for beforeSleep after writing the aof buffer. */ + if (server.aof_fsync != AOF_FSYNC_ALWAYS) { + trySendWriteToIOThreads(c); } - return C_OK; } /* Free the clients marked as CLOSE_ASAP, return the number of clients @@ -1827,57 +1887,204 @@ client *lookupClientByID(uint64_t id) { return c; } +void writeToReplica(client *c) { + /* Can be called from main-thread only as replica write offload is not supported yet */ + serverAssert(inMainThread()); + int nwritten = 0; + serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); + while (clientHasPendingReplies(c)) { + replBufBlock *o = listNodeValue(c->ref_repl_buf_node); + serverAssert(o->used >= c->ref_block_pos); + + /* Send current block if it is not fully sent. */ + if (o->used > c->ref_block_pos) { + nwritten = connWrite(c->conn, o->buf + c->ref_block_pos, o->used - c->ref_block_pos); + if (nwritten <= 0) { + c->write_flags |= WRITE_FLAGS_WRITE_ERROR; + return; + } + c->nwritten += nwritten; + c->ref_block_pos += nwritten; + } + + /* If we fully sent the object on head, go to the next one. 
*/ + listNode *next = listNextNode(c->ref_repl_buf_node); + if (next && c->ref_block_pos == o->used) { + o->refcount--; + ((replBufBlock *)(listNodeValue(next)))->refcount++; + c->ref_repl_buf_node = next; + c->ref_block_pos = 0; + incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); + } + } +} + /* This function should be called from _writeToClient when the reply list is not empty, * it gathers the scattered buffers from reply list and sends them away with connWritev. - * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, - * and 'nwritten' is an output parameter, it means how many bytes server write - * to client. */ -static int _writevToClient(client *c, ssize_t *nwritten) { + * If we write successfully, it returns C_OK, otherwise, C_ERR is returned. + * Sets the c->nwritten to the number of bytes the server wrote to the client. + * Can be called from the main thread or an I/O thread */ +static int writevToClient(client *c) { int iovcnt = 0; int iovmax = min(IOV_MAX, c->conn->iovcnt); - struct iovec iov[iovmax]; - size_t iov_bytes_len = 0; + struct iovec iov_arr[iovmax]; + struct iovec *iov = iov_arr; + ssize_t bufpos, iov_bytes_len = 0; + listNode *lastblock; + + if (inMainThread()) { + lastblock = listLast(c->reply); + bufpos = c->bufpos; + } else { + lastblock = c->io_last_reply_block; + bufpos = lastblock ? (size_t)c->bufpos : c->io_last_bufpos; + } + /* If the static reply buffer is not empty, * add it to the iov array for writev() as well. */ - if (c->bufpos > 0) { + if (bufpos > 0) { iov[iovcnt].iov_base = c->buf + c->sentlen; - iov[iovcnt].iov_len = c->bufpos - c->sentlen; + iov[iovcnt].iov_len = bufpos - c->sentlen; iov_bytes_len += iov[iovcnt++].iov_len; } /* The first node of reply list might be incomplete from the last call, * thus it needs to be calibrated to get the actual data address and length. */ - size_t offset = c->bufpos > 0 ? 0 : c->sentlen; + size_t sentlen = bufpos > 0 ? 
0 : c->sentlen; listIter iter; listNode *next; clientReplyBlock *o; + size_t used; listRewind(c->reply, &iter); while ((next = listNext(&iter)) && iovcnt < iovmax && iov_bytes_len < NET_MAX_WRITES_PER_EVENT) { o = listNodeValue(next); - if (o->used == 0) { /* empty node, just release it and skip. */ - c->reply_bytes -= o->size; - listDelNode(c->reply, next); - offset = 0; + + used = o->used; + /* Use c->io_last_bufpos as the currently used portion of the block. + * We use io_last_bufpos instead of o->used to ensure that we only access data guaranteed to be visible to the + * current thread. Using o->used, which may have been updated by the main thread, could lead to accessing data + * that may not yet be visible to the current thread. */ + if (!inMainThread() && next == lastblock) used = c->io_last_bufpos; + + if (used == 0) { /* empty node, skip over it. */ + if (next == lastblock) break; + sentlen = 0; continue; } - iov[iovcnt].iov_base = o->buf + offset; - iov[iovcnt].iov_len = o->used - offset; + iov[iovcnt].iov_base = o->buf + sentlen; + iov[iovcnt].iov_len = used - sentlen; iov_bytes_len += iov[iovcnt++].iov_len; - offset = 0; + + sentlen = 0; + if (next == lastblock) break; } - if (iovcnt == 0) return C_OK; - *nwritten = connWritev(c->conn, iov, iovcnt); - if (*nwritten <= 0) return C_ERR; + + serverAssert(iovcnt != 0); + + ssize_t totwritten = 0; + while (1) { + int nwritten = connWritev(c->conn, iov, iovcnt); + if (nwritten <= 0) { + c->write_flags |= WRITE_FLAGS_WRITE_ERROR; + totwritten = totwritten > 0 ?
totwritten : nwritten; + break; + } + totwritten += nwritten; + + if (totwritten == iov_bytes_len) break; + + if (totwritten > NET_MAX_WRITES_PER_EVENT) { + /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT + * bytes, Since it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interface). + * + * However if we are over the maxmemory limit we ignore that and + * just deliver as much data as it is possible to deliver. */ + int ignore_max_write_limit = server.maxmemory > 0 && zmalloc_used_memory() > server.maxmemory; + if (!ignore_max_write_limit) { + break; + } + } + + /* proceed to the unwritten blocks */ + while (nwritten > 0) { + if ((size_t)nwritten < iov[0].iov_len) { + iov[0].iov_base = (char *)iov[0].iov_base + nwritten; + iov[0].iov_len -= nwritten; + break; + } + nwritten -= iov[0].iov_len; + iov++; + iovcnt--; + } + } + + c->nwritten = totwritten; + return totwritten > 0 ? C_OK : C_ERR; +} + +/* This function does actual writing output buffers to non-replica client, it is called by writeToClient. + * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, + * and 'c->nwritten' is set to the number of bytes the server wrote to the client. */ +int _writeToClient(client *c) { + listNode *lastblock; + size_t bufpos; + + if (inMainThread()) { + /* In the main thread, access bufpos and lastblock directly */ + lastblock = listLast(c->reply); + bufpos = (size_t)c->bufpos; + } else { + /* If there is a last block, use bufpos directly; otherwise, use io_last_bufpos */ + bufpos = c->io_last_reply_block ? 
(size_t)c->bufpos : c->io_last_bufpos; + lastblock = c->io_last_reply_block; + } + + /* If the reply list is not empty, use writev to save system calls and TCP packets */ + if (lastblock) return writevToClient(c); + + ssize_t bytes_to_write = bufpos - c->sentlen; + ssize_t tot_written = 0; + + while (tot_written < bytes_to_write) { + int nwritten = connWrite(c->conn, c->buf + c->sentlen, bytes_to_write - tot_written); + if (nwritten <= 0) { + c->write_flags |= WRITE_FLAGS_WRITE_ERROR; + tot_written = tot_written > 0 ? tot_written : nwritten; + break; + } + tot_written += nwritten; + } + + c->nwritten = tot_written; + return tot_written > 0 ? C_OK : C_ERR; +} + +static void postWriteToReplica(client *c) { + serverAssert(inMainThread()); + if (c->nwritten > 0) c->net_output_bytes += c->nwritten; +} + +static void _postWriteToClient(client *c) { + if (c->nwritten <= 0) return; + + listIter iter; + listNode *next; + clientReplyBlock *o; + + server.stat_net_output_bytes += c->nwritten; /* Locate the new node which has leftover data and * release all nodes in front of it. */ - ssize_t remaining = *nwritten; - if (c->bufpos > 0) { /* deal with static reply buffer first. */ + ssize_t remaining = c->nwritten; + if (c->bufpos > 0) { /* Deal with static reply buffer first. */ int buf_len = c->bufpos - c->sentlen; - c->sentlen += remaining; + c->sentlen += c->nwritten; /* If the buffer was sent, set bufpos to zero to continue with * the remainder of the reply. */ - if (remaining >= buf_len) { + if (c->nwritten >= buf_len) { c->bufpos = 0; c->sentlen = 0; } @@ -1896,116 +2103,31 @@ static int _writevToClient(client *c, ssize_t *nwritten) { listDelNode(c->reply, next); c->sentlen = 0; } - - return C_OK; -} - -/* This function does actual writing output buffers to different types of - * clients, it is called by writeToClient. 
- * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, - * and 'nwritten' is an output parameter, it means how many bytes server write - * to client. */ -int _writeToClient(client *c, ssize_t *nwritten) { - *nwritten = 0; - if (getClientType(c) == CLIENT_TYPE_REPLICA) { - serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - - replBufBlock *o = listNodeValue(c->ref_repl_buf_node); - serverAssert(o->used >= c->ref_block_pos); - /* Send current block if it is not fully sent. */ - if (o->used > c->ref_block_pos) { - *nwritten = connWrite(c->conn, o->buf + c->ref_block_pos, o->used - c->ref_block_pos); - if (*nwritten <= 0) return C_ERR; - c->ref_block_pos += *nwritten; - } - - /* If we fully sent the object on head, go to the next one. */ - listNode *next = listNextNode(c->ref_repl_buf_node); - if (next && c->ref_block_pos == o->used) { - o->refcount--; - ((replBufBlock *)(listNodeValue(next)))->refcount++; - c->ref_repl_buf_node = next; - c->ref_block_pos = 0; - incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); - } - return C_OK; - } - - /* When the reply list is not empty, it's better to use writev to save us some - * system calls and TCP packets. */ - if (listLength(c->reply) > 0) { - int ret = _writevToClient(c, nwritten); - if (ret != C_OK) return ret; - - /* If there are no longer objects in the list, we expect - * the count of reply bytes to be exactly zero. */ - if (listLength(c->reply) == 0) serverAssert(c->reply_bytes == 0); - } else if (c->bufpos > 0) { - *nwritten = connWrite(c->conn, c->buf + c->sentlen, c->bufpos - c->sentlen); - if (*nwritten <= 0) return C_ERR; - c->sentlen += *nwritten; - - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. */ - if ((int)c->sentlen == c->bufpos) { - c->bufpos = 0; - c->sentlen = 0; - } - } - - return C_OK; } -/* Write data in output buffers to client. 
Return C_OK if the client - * is still valid after the call, C_ERR if it was freed because of some - * error. If handler_installed is set, it will attempt to clear the - * write event. - * - * This function is called by threads, but always with handler_installed - * set to 0. So when handler_installed is set to 0 the function must be - * thread safe. */ -int writeToClient(client *c, int handler_installed) { +/* Updates the client's memory usage and bucket and server stats after writing. + * If a write handler is installed, it will attempt to clear the write event. + * If the client is no longer valid, it will return C_ERR, otherwise C_OK. */ +int postWriteToClient(client *c) { + c->io_last_reply_block = NULL; + c->io_last_bufpos = 0; /* Update total number of writes on server */ - atomic_fetch_add_explicit(&server.stat_total_writes_processed, 1, memory_order_relaxed); - - ssize_t nwritten = 0, totwritten = 0; - - while (clientHasPendingReplies(c)) { - int ret = _writeToClient(c, &nwritten); - if (ret == C_ERR) break; - totwritten += nwritten; - /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interface). - * - * However if we are over the maxmemory limit we ignore that and - * just deliver as much data as it is possible to deliver.
- * - * Moreover, we also send as much as possible if the client is - * a replica or a monitor (otherwise, on high-speed traffic, the - * replication/output buffer will grow indefinitely) */ - if (totwritten > NET_MAX_WRITES_PER_EVENT && - (server.maxmemory == 0 || zmalloc_used_memory() < server.maxmemory) && !c->flag.replica) - break; - } - + server.stat_total_writes_processed++; if (getClientType(c) == CLIENT_TYPE_REPLICA) { - atomic_fetch_add_explicit(&server.stat_net_repl_output_bytes, totwritten, memory_order_relaxed); + postWriteToReplica(c); } else { - atomic_fetch_add_explicit(&server.stat_net_output_bytes, totwritten, memory_order_relaxed); + _postWriteToClient(c); } - c->net_output_bytes += totwritten; - if (nwritten == -1) { + if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { if (connGetState(c->conn) != CONN_STATE_CONNECTED) { serverLog(LL_VERBOSE, "Error writing to client: %s", connGetLastError(c->conn)); freeClientAsync(c); return C_ERR; } } - if (totwritten > 0) { + if (c->nwritten > 0) { + c->net_output_bytes += c->nwritten; /* For clients representing primaries we don't count sending data * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. @@ -2014,12 +2136,7 @@ int writeToClient(client *c, int handler_installed) { } if (!clientHasPendingReplies(c)) { c->sentlen = 0; - /* Note that writeToClient() is called in a threaded way, but - * aeDeleteFileEvent() is not thread safe: however writeToClient() - * is always called with handler_installed set to 0 from threads - * so we are fine. */ - if (handler_installed) { - serverAssert(io_threads_op == IO_THREADS_OP_IDLE); + if (connHasWriteHandler(c->conn)) { connSetWriteHandler(c->conn, NULL); } @@ -2029,17 +2146,239 @@ int writeToClient(client *c, int handler_installed) { return C_ERR; } } - /* Update client's memory usage after writing. - * Since this isn't thread safe we do this conditionally. 
In case of threaded writes this is done in - * handleClientsWithPendingWritesUsingThreads(). */ - if (io_threads_op == IO_THREADS_OP_IDLE) updateClientMemUsageAndBucket(c); + /* Update client's memory usage after writing.*/ + updateClientMemUsageAndBucket(c); return C_OK; } +/* Write data in output buffers to client. Return C_OK if the client + * is still valid after the call, C_ERR if it was freed because of some + * error. + * + * This function is called by main-thread only */ +int writeToClient(client *c) { + if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) return C_OK; + + c->nwritten = 0; + c->write_flags = 0; + + if (getClientType(c) == CLIENT_TYPE_REPLICA) { + writeToReplica(c); + } else { + _writeToClient(c); + } + + return postWriteToClient(c); +} + /* Write event handler. Just send data to the client. */ void sendReplyToClient(connection *conn) { client *c = connGetPrivateData(conn); - writeToClient(c, 1); + if (trySendWriteToIOThreads(c) == C_OK) return; + writeToClient(c); +} + +void handleQbLimitReached(client *c) { + sds ci = catClientInfoString(sdsempty(), c), bytes = sdsempty(); + bytes = sdscatrepr(bytes, c->querybuf, 64); + serverLog(LL_WARNING, "Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, + bytes); + sdsfree(ci); + sdsfree(bytes); + freeClientAsync(c); + server.stat_client_qbuf_limit_disconnections++; +} + +/* Handle read errors and update statistics. + * + * Called only from the main thread. + * If the read was done in an I/O thread, this function is invoked after the + * read job has completed, in the main thread context. + * + * Returns: + * - C_OK if the querybuf can be further processed. + * - C_ERR if not. 
*/ +int handleReadResult(client *c) { + serverAssert(inMainThread()); + server.stat_total_reads_processed++; + if (c->nread <= 0) { + if (c->nread == -1) { + if (connGetState(c->conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, "Reading from client: %s", connGetLastError(c->conn)); + freeClientAsync(c); + } + } else if (c->nread == 0) { + if (server.verbosity <= LL_VERBOSE) { + sds info = catClientInfoString(sdsempty(), c); + serverLog(LL_VERBOSE, "Client closed connection %s", info); + sdsfree(info); + } + freeClientAsync(c); + } + return C_ERR; + } + + c->last_interaction = server.unixtime; + c->net_input_bytes += c->nread; + if (c->flag.primary) { + c->read_reploff += c->nread; + server.stat_net_repl_input_bytes += c->nread; + } else { + server.stat_net_input_bytes += c->nread; + } + + /* Handle QB limit */ + if (c->read_flags & READ_FLAGS_QB_LIMIT_REACHED) { + handleQbLimitReached(c); + return C_ERR; + } + return C_OK; +} + + +void handleParseError(client *c) { + int flags = c->read_flags; + if (flags & READ_FLAGS_ERROR_BIG_INLINE_REQUEST) { + addReplyError(c, "Protocol error: too big inline request"); + setProtocolError("too big inline request", c); + } else if (flags & READ_FLAGS_ERROR_BIG_MULTIBULK) { + addReplyError(c, "Protocol error: too big mbulk count string"); + setProtocolError("too big mbulk count string", c); + } else if (flags & READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN) { + addReplyError(c, "Protocol error: invalid multibulk length"); + setProtocolError("invalid mbulk count", c); + } else if (flags & READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN) { + addReplyError(c, "Protocol error: unauthenticated multibulk length"); + setProtocolError("unauth mbulk count", c); + } else if (flags & READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN) { + addReplyError(c, "Protocol error: unauthenticated bulk length"); + setProtocolError("unauth bulk length", c); + } else if (flags & READ_FLAGS_ERROR_BIG_BULK_COUNT) { + addReplyError(c, "Protocol error: too big bulk 
count string"); + setProtocolError("too big bulk count string", c); + } else if (flags & READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER) { + addReplyErrorFormat(c, "Protocol error: expected '$', got '%c'", c->querybuf[c->qb_pos]); + setProtocolError("expected $ but got something else", c); + } else if (flags & READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN) { + addReplyError(c, "Protocol error: invalid bulk length"); + setProtocolError("invalid bulk length", c); + } else if (flags & READ_FLAGS_ERROR_UNBALANCED_QUOTES) { + addReplyError(c, "Protocol error: unbalanced quotes in request"); + setProtocolError("unbalanced quotes in inline request", c); + } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY) { + serverLog(LL_WARNING, "WARNING: Receiving inline protocol from primary, primary stream corruption? Closing the " + "primary connection and discarding the cached primary."); + setProtocolError("Master using the inline protocol. Desync?", c); + } else { + serverAssertWithInfo(c, NULL, "Unknown parsing error"); + } +} + +int isParsingError(client *c) { + return c->read_flags & (READ_FLAGS_ERROR_BIG_INLINE_REQUEST | READ_FLAGS_ERROR_BIG_MULTIBULK | + READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN | READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN | + READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN | READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN | + READ_FLAGS_ERROR_BIG_BULK_COUNT | READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER | + READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY | READ_FLAGS_ERROR_UNBALANCED_QUOTES); +} + +/* This function is called after the query-buffer was parsed. + * It is used to handle parsing errors and to update the client state. + * The function returns PARSE_OK if processing can proceed, PARSE_ERR on a parsing error, else PARSE_NEEDMORE.
*/ +parseResult handleParseResults(client *c) { + if (isParsingError(c)) { + handleParseError(c); + return PARSE_ERR; + } + + if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN && getClientType(c) == CLIENT_TYPE_REPLICA) { + c->repl_ack_time = server.unixtime; + } + + if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN) { + /* in case the client's query was an empty line we will ignore it and proceed to process the rest of the buffer + * if any */ + resetClient(c); + return PARSE_OK; + } + + if (c->read_flags & READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN) { + /* Multibulk processing could see a <= 0 length. */ + resetClient(c); + return PARSE_OK; + } + + if (c->read_flags & READ_FLAGS_PARSING_COMPLETED) { + return PARSE_OK; + } else { + return PARSE_NEEDMORE; + } +} + +/* Process the completion of an IO write operation for a client. + * This function handles various post-write tasks, including updating client state, + * returns 1 if processing completed successfully, 0 if processing is skipped. */ +int processClientIOWriteDone(client *c) { + /* memory barrier acquire to get the latest client state */ + atomic_thread_fence(memory_order_acquire); + /* If a client is protected, don't proceed to check the write results as it may trigger conn close. */ + if (c->flag.protected) return 0; + + listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node); + c->flag.pending_write = 0; + c->io_write_state = CLIENT_IDLE; + + /* Don't post-process-writes to clients that are going to be closed anyway. 
*/ + if (c->flag.close_asap) return 0; + + /* Update processed count on server */ + server.stat_io_writes_processed += 1; + + connSetPostponeUpdateState(c->conn, 0); + connUpdateState(c->conn); + if (postWriteToClient(c) == C_ERR) { + return 1; + } + + if (clientHasPendingReplies(c)) { + if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { + /* Install the write handler if there are pending writes in some of the clients as a result of not being + * able to write everything in one go. */ + installClientWriteHandler(c); + } else { + /* If we can send the client to the I/O thread, let it handle the write. */ + if (trySendWriteToIOThreads(c) == C_OK) return 1; + /* Try again in the next eventloop */ + putClientInPendingWriteQueue(c); + } + } + + return 1; +} + +/* This function handles the post-processing of I/O write operations that have been + * completed for clients. It iterates through the list of clients with pending I/O + * writes and performs necessary actions based on their current state. + * + * Returns The number of clients processed during this function call. */ +int processIOThreadsWriteDone(void) { + if (listLength(server.clients_pending_io_write) == 0) return 0; + int processed = 0; + listNode *ln; + + listNode *next = listFirst(server.clients_pending_io_write); + while (next) { + ln = next; + next = listNextNode(ln); + client *c = listNodeValue(ln); + + /* Client is still waiting for a pending I/O - skip it */ + if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue; + + processed += processClientIOWriteDone(c); + } + + return processed; } /* This function is called just before entering the event loop, in the hope @@ -2047,10 +2386,16 @@ void sendReplyToClient(connection *conn) { * need to use a syscall in order to install the writable event handler, * get it called, and so forth. 
*/ int handleClientsWithPendingWrites(void) { + int processed = 0; + int pending_writes = listLength(server.clients_pending_write); + if (pending_writes == 0) return processed; /* Return ASAP if there are no clients. */ + + /* Adjust the number of I/O threads based on the number of pending writes; this is required in case pending_writes > + * poll_events (for example in pubsub) */ + adjustIOThreadsByEventLoad(pending_writes, 1); + listIter li; listNode *ln; - int processed = listLength(server.clients_pending_write); - listRewind(server.clients_pending_write, &li); while ((ln = listNext(&li))) { client *c = listNodeValue(ln); @@ -2064,8 +2409,18 @@ int handleClientsWithPendingWrites(void) { /* Don't write to clients that are going to be closed anyway. */ if (c->flag.close_asap) continue; + if (!clientHasPendingReplies(c)) continue; + + /* If we can send the client to the I/O thread, let it handle the write. */ + if (trySendWriteToIOThreads(c) == C_OK) continue; + + /* We can't write to the client while IO operation is in progress. */ + if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) continue; + + processed++; + /* Try to write buffers to the client socket. */ - if (writeToClient(c, 0) == C_ERR) continue; + if (writeToClient(c) == C_ERR) continue; /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler.
- * If any data remained in the buffer, the client will take ownership of the buffer - * and a new empty buffer will be allocated for the shared buffer. */ -void resetSharedQueryBuf(client *c) { - serverAssert(c->querybuf == thread_shared_qb); - size_t remaining = sdslen(c->querybuf) - c->qb_pos; - - if (remaining > 0) { - /* Let the client take ownership of the shared buffer. */ - initSharedQueryBuf(); - return; - } - - c->querybuf = NULL; - sdsclear(thread_shared_qb); - c->qb_pos = 0; -} - -/* Trims the client query buffer to the current position. */ -void trimClientQueryBuffer(client *c) { - if (c->querybuf == thread_shared_qb) { - resetSharedQueryBuf(c); - } - - if (c->querybuf == NULL) { - return; - } - - serverAssert(c->qb_pos <= sdslen(c->querybuf)); - - if (c->qb_pos > 0) { - sdsrange(c->querybuf, c->qb_pos, -1); - c->qb_pos = 0; + c->flag.reply_skip_next = 0; } } +/* Initializes the shared query buffer to a new sds with the default capacity */ +void initSharedQueryBuf(void) { + thread_shared_qb = sdsnewlen(NULL, PROTO_IOBUF_LEN); + sdsclear(thread_shared_qb); +} + +void freeSharedQueryBuf(void) { + sdsfree(thread_shared_qb); + thread_shared_qb = NULL; +} + /* This function is used when we want to re-enter the event loop but there * is the risk that the client we are dealing with will be freed in some * way. This happens for instance in: @@ -2193,16 +2517,14 @@ void unprotectClient(client *c) { /* Like processMultibulkBuffer(), but for the inline protocol instead of RESP, * this function consumes the client query buffer and creates a command ready - * to be executed inside the client structure. Returns C_OK if the command - * is ready to be executed, or C_ERR if there is still protocol to read to - * have a well formed command. The function also returns C_ERR when there is - * a protocol error: in such a case the client structure is setup to reply - * with the error and close the connection. 
*/ -int processInlineBuffer(client *c) { + * to be executed inside the client structure. + * Sets the client read_flags to indicate the parsing outcome. */ +void processInlineBuffer(client *c) { char *newline; int argc, j, linefeed_chars = 1; sds *argv, aux; size_t querylen; + int is_primary = c->read_flags & READ_FLAGS_PRIMARY; /* Search for end of line */ newline = strchr(c->querybuf + c->qb_pos, '\n'); @@ -2210,10 +2532,9 @@ int processInlineBuffer(client *c) { /* Nothing to do without a \r\n */ if (newline == NULL) { if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c, "Protocol error: too big inline request"); - setProtocolError("too big inline request", c); + c->read_flags |= READ_FLAGS_ERROR_BIG_INLINE_REQUEST; } - return C_ERR; + return; } /* Handle the \r\n case. */ @@ -2225,15 +2546,13 @@ int processInlineBuffer(client *c) { argv = sdssplitargs(aux, &argc); sdsfree(aux); if (argv == NULL) { - addReplyError(c, "Protocol error: unbalanced quotes in request"); - setProtocolError("unbalanced quotes in inline request", c); - return C_ERR; + c->read_flags |= READ_FLAGS_ERROR_UNBALANCED_QUOTES; + return; } - /* Newline from replicas can be used to refresh the last ACK time. - * This is useful for a replica to ping back while loading a big - * RDB file. */ - if (querylen == 0 && getClientType(c) == CLIENT_TYPE_REPLICA) c->repl_ack_time = server.unixtime; + if (querylen == 0) { + c->read_flags |= READ_FLAGS_INLINE_ZERO_QUERY_LEN; + } /* Primaries should never send us inline protocol to run actual * commands. If this happens, it is likely due to a bug in the server where @@ -2242,12 +2561,10 @@ int processInlineBuffer(client *c) { * * However there is an exception: primaries may send us just a newline * to keep the connection active. 
*/ - if (querylen != 0 && c->flag.primary) { + if (querylen != 0 && is_primary) { sdsfreesplitres(argv, argc); - serverLog(LL_WARNING, "WARNING: Receiving inline protocol from primary, primary stream corruption? Closing the " - "primary connection and discarding the cached primary."); - setProtocolError("Primary using the inline protocol. Desync?", c); - return C_ERR; + c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY; + return; } /* Move querybuffer position to the next query in the buffer. */ @@ -2268,7 +2585,7 @@ int processInlineBuffer(client *c) { c->argv_len_sum += sdslen(argv[j]); } zfree(argv); - return C_OK; + c->read_flags |= READ_FLAGS_PARSING_COMPLETED; } /* Helper function. Record protocol error details in server log, @@ -2281,9 +2598,10 @@ static void setProtocolError(const char *errstr, client *c) { /* Sample some protocol to given an idea about what was inside. */ char buf[256]; - if (sdslen(c->querybuf) - c->qb_pos < PROTO_DUMP_LEN) { + buf[0] = '\0'; + if (c->querybuf && sdslen(c->querybuf) - c->qb_pos < PROTO_DUMP_LEN) { snprintf(buf, sizeof(buf), "Query buffer during protocol error: '%s'", c->querybuf + c->qb_pos); - } else { + } else if (c->querybuf) { snprintf(buf, sizeof(buf), "Query buffer during protocol error: '%.*s' (... more %zu bytes ...) '%.*s'", PROTO_DUMP_LEN / 2, c->querybuf + c->qb_pos, sdslen(c->querybuf) - c->qb_pos - PROTO_DUMP_LEN, PROTO_DUMP_LEN / 2, c->querybuf + sdslen(c->querybuf) - PROTO_DUMP_LEN / 2); @@ -2306,20 +2624,18 @@ static void setProtocolError(const char *errstr, client *c) { } /* Process the query buffer for client 'c', setting up the client argument - * vector for command execution. Returns C_OK if after running the function - * the client has a well-formed ready to be processed command, otherwise - * C_ERR if there is still to read more buffer to get the full command. 
- * The function also returns C_ERR when there is a protocol error: in such a - * case the client structure is setup to reply with the error and close - * the connection. + * vector for command execution. + * Sets the client's read_flags to indicate the parsing outcome. * * This function is called if processInputBuffer() detects that the next * command is in RESP format, so the first byte in the command is found * to be '*'. Otherwise for inline commands processInlineBuffer() is called. */ -int processMultibulkBuffer(client *c) { +void processMultibulkBuffer(client *c) { char *newline = NULL; int ok; long long ll; + int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED; if (c->multibulklen == 0) { /* The client should have been reset */ @@ -2329,32 +2645,32 @@ int processMultibulkBuffer(client *c) { newline = strchr(c->querybuf + c->qb_pos, '\r'); if (newline == NULL) { if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c, "Protocol error: too big mbulk count string"); - setProtocolError("too big mbulk count string", c); + c->read_flags |= READ_FLAGS_ERROR_BIG_MULTIBULK; } - return C_ERR; + return; } /* Buffer should also contain \n */ - if (newline - (c->querybuf + c->qb_pos) > (ssize_t)(sdslen(c->querybuf) - c->qb_pos - 2)) return C_ERR; + if (newline - (c->querybuf + c->qb_pos) > (ssize_t)(sdslen(c->querybuf) - c->qb_pos - 2)) return; /* We know for sure there is a whole line since newline != NULL, * so go ahead and find out the multi bulk length. 
*/ serverAssertWithInfo(c, NULL, c->querybuf[c->qb_pos] == '*'); ok = string2ll(c->querybuf + 1 + c->qb_pos, newline - (c->querybuf + 1 + c->qb_pos), &ll); if (!ok || ll > INT_MAX) { - addReplyError(c, "Protocol error: invalid multibulk length"); - setProtocolError("invalid mbulk count", c); - return C_ERR; - } else if (ll > 10 && authRequired(c)) { - addReplyError(c, "Protocol error: unauthenticated multibulk length"); - setProtocolError("unauth mbulk count", c); - return C_ERR; + c->read_flags |= READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN; + return; + } else if (ll > 10 && auth_required) { + c->read_flags |= READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN; + return; } c->qb_pos = (newline - c->querybuf) + 2; - if (ll <= 0) return C_OK; + if (ll <= 0) { + c->read_flags |= READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN; + return; + } c->multibulklen = ll; @@ -2372,9 +2688,8 @@ int processMultibulkBuffer(client *c) { newline = strchr(c->querybuf + c->qb_pos, '\r'); if (newline == NULL) { if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) { - addReplyError(c, "Protocol error: too big bulk count string"); - setProtocolError("too big bulk count string", c); - return C_ERR; + c->read_flags |= READ_FLAGS_ERROR_BIG_BULK_COUNT; + return; } break; } @@ -2383,24 +2698,21 @@ int processMultibulkBuffer(client *c) { if (newline - (c->querybuf + c->qb_pos) > (ssize_t)(sdslen(c->querybuf) - c->qb_pos - 2)) break; if (c->querybuf[c->qb_pos] != '$') { - addReplyErrorFormat(c, "Protocol error: expected '$', got '%c'", c->querybuf[c->qb_pos]); - setProtocolError("expected $ but got something else", c); - return C_ERR; + c->read_flags |= READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER; + return; } ok = string2ll(c->querybuf + c->qb_pos + 1, newline - (c->querybuf + c->qb_pos + 1), &ll); - if (!ok || ll < 0 || (!c->flag.primary && ll > server.proto_max_bulk_len)) { - addReplyError(c, "Protocol error: invalid bulk length"); - setProtocolError("invalid bulk length", c); - return C_ERR; - } else 
if (ll > 16384 && authRequired(c)) { - addReplyError(c, "Protocol error: unauthenticated bulk length"); - setProtocolError("unauth bulk length", c); - return C_ERR; + if (!ok || ll < 0 || (!(is_primary) && ll > server.proto_max_bulk_len)) { + c->read_flags |= READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN; + return; + } else if (ll > 16384 && auth_required) { + c->read_flags |= READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN; + return; } c->qb_pos = newline - c->querybuf + 2; - if (!c->flag.primary && ll >= PROTO_MBULK_BIG_ARG) { + if (!(is_primary) && ll >= PROTO_MBULK_BIG_ARG) { /* When the client is not a primary client (because primary * client's querybuf can only be trimmed after data applied * and sent to replicas). @@ -2446,7 +2758,7 @@ int processMultibulkBuffer(client *c) { /* Optimization: if a non-primary client's buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ - if (!c->flag.primary && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && + if (!is_primary && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) { c->argv[c->argc++] = createObject(OBJ_STRING, c->querybuf); c->argv_len_sum += c->bulklen; @@ -2466,10 +2778,7 @@ int processMultibulkBuffer(client *c) { } /* We're done when c->multibulk == 0 */ - if (c->multibulklen == 0) return C_OK; - - /* Still not ready to process the command */ - return C_ERR; + if (c->multibulklen == 0) c->read_flags |= READ_FLAGS_PARSING_COMPLETED; } /* Perform necessary tasks after a command was executed: @@ -2572,122 +2881,103 @@ int processPendingCommandAndInputBuffer(client *c) { return C_OK; } -/* This function is called every time, in the client structure 'c', there is - * more query buffer to process, because we read more data from the socket - * or because a client was blocked and later reactivated, so there could be - * pending query buffer, already representing a full command, to 
process. - * return C_ERR in case the client was freed during the processing */ +/* Parse a single command from the query buf. + * + * This function may be called from the main thread or from the I/O thread. + * + * Sets the client's read_flags to indicate the parsing outcome */ +void parseCommand(client *c) { + /* Determine request type when unknown. */ + if (!c->reqtype) { + if (c->querybuf[c->qb_pos] == '*') { + c->reqtype = PROTO_REQ_MULTIBULK; + } else { + c->reqtype = PROTO_REQ_INLINE; + } + } + + if (c->reqtype == PROTO_REQ_INLINE) { + processInlineBuffer(c); + } else if (c->reqtype == PROTO_REQ_MULTIBULK) { + processMultibulkBuffer(c); + } else { + serverPanic("Unknown request type"); + } +} + +int canParseCommand(client *c) { + if (c->cmd != NULL) return 0; + + /* Don't parse a command if the client is in the middle of something. */ + if (c->flag.blocked || c->flag.unblocked) return 0; + + /* Don't process more buffers from clients that have already pending + * commands to execute in c->argv. */ + if (c->flag.pending_command) return 0; + + /* Don't process input from the primary while there is a busy script + * condition on the replica. We want just to accumulate the replication + * stream (instead of replying -BUSY like we do with other clients) and + * later resume the processing. */ + if (isInsideYieldingLongCommand() && c->flag.primary) return 0; + + /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is + * written to the client. Make sure to not let the reply grow after + * this flag has been set (i.e. don't process more commands). + * + * The same applies for clients we want to terminate ASAP. */ + if (c->flag.close_after_reply || c->flag.close_asap) return 0; + + return 1; +} + int processInputBuffer(client *c) { - /* Keep processing while there is something in the input buffer */ + /* Parse the query buffer. */ while (c->querybuf && c->qb_pos < sdslen(c->querybuf)) { - /* Immediately abort if the client is in the middle of something. 
*/ - if (c->flag.blocked) break; - - /* Don't process more buffers from clients that have already pending - * commands to execute in c->argv. */ - if (c->flag.pending_command) break; - - /* Don't process input from the primary while there is a busy script - * condition on the replica. We want just to accumulate the replication - * stream (instead of replying -BUSY like we do with other clients) and - * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flag.primary) break; - - /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is - * written to the client. Make sure to not let the reply grow after - * this flag has been set (i.e. don't process more commands). - * - * The same applies for clients we want to terminate ASAP. */ - if (c->flag.close_after_reply || c->flag.close_asap) break; - - /* Determine request type when unknown. */ - if (!c->reqtype) { - if (c->querybuf[c->qb_pos] == '*') { - c->reqtype = PROTO_REQ_MULTIBULK; - } else { - c->reqtype = PROTO_REQ_INLINE; - } + if (!canParseCommand(c)) { + break; } - if (c->reqtype == PROTO_REQ_INLINE) { - if (processInlineBuffer(c) != C_OK) break; - } else if (c->reqtype == PROTO_REQ_MULTIBULK) { - if (processMultibulkBuffer(c) != C_OK) break; - } else { - serverPanic("Unknown request type"); + c->read_flags = c->flag.primary ? READ_FLAGS_PRIMARY : 0; + c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; + + parseCommand(c); + + if (handleParseResults(c) != PARSE_OK) { + break; } - /* Multibulk processing could see a <= 0 length. */ if (c->argc == 0) { - resetClient(c); - } else { - /* If we are in the context of an I/O thread, we can't really - * execute the command here. All we can do is to flag the client - * as one that needs to process the command. 
*/ - if (io_threads_op != IO_THREADS_OP_IDLE) { - serverAssert(io_threads_op == IO_THREADS_OP_READ); - c->flag.pending_command = 1; - break; - } - - if (c->querybuf == thread_shared_qb) { - /* Before processing the command, reset the shared query buffer to its default state. - * This avoids unintentionally modifying the shared qb during processCommand as we may use - * the shared qb for other clients during processEventsWhileBlocked */ - resetSharedQueryBuf(c); - } + /* No command to process - continue parsing the query buf. */ + continue; + } - /* We are finally ready to execute the command. */ - if (processCommandAndResetClient(c) == C_ERR) { - /* If the client is no longer valid, we avoid exiting this - * loop and trimming the client buffer later. So we return - * ASAP in that case. */ - return C_ERR; - } + if (c->querybuf == thread_shared_qb) { + /* Before processing the command, reset the shared query buffer to its default state. + * This avoids unintentionally modifying the shared qb during processCommand as we may use + * the shared qb for other clients during processEventsWhileBlocked */ + resetSharedQueryBuf(c); } - } - if (c->flag.primary) { - /* If the client is a primary, trim the querybuf to repl_applied, - * since primary client is very special, its querybuf not only - * used to parse command, but also proxy to sub-replicas. - * - * Here are some scenarios we cannot trim to qb_pos: - * 1. we don't receive complete command from primary - * 2. primary client blocked cause of client pause - * 3. io threads operate read, primary client flagged with CLIENT_PENDING_COMMAND - * - * In these scenarios, qb_pos points to the part of the current command - * or the beginning of next command, and the current command is not applied yet, - * so the repl_applied is not equal to qb_pos. */ - if (c->repl_applied) { - sdsrange(c->querybuf, c->repl_applied, -1); - c->qb_pos -= c->repl_applied; - c->repl_applied = 0; + /* We are finally ready to execute the command. 
*/ + if (processCommandAndResetClient(c) == C_ERR) { + /* If the client is no longer valid, we avoid exiting this + * loop and trimming the client buffer later. So we return + * ASAP in that case. */ + return C_ERR; } - } else { - trimClientQueryBuffer(c); } - /* Update client memory usage after processing the query buffer, this is - * important in case the query buffer is big and wasn't drained during - * the above loop (because of partially sent big commands). */ - if (io_threads_op == IO_THREADS_OP_IDLE) updateClientMemUsageAndBucket(c); - return C_OK; } -void readQueryFromClient(connection *conn) { - client *c = connGetPrivateData(conn); - int nread, big_arg = 0; +/* This function can be called from the main-thread or from the IO-thread. + * The function allocates query-buf for the client if required and reads to it from the network. + * It will set c->nread to the bytes read from the network. */ +void readToQueryBuf(client *c) { + int big_arg = 0; size_t qblen, readlen; - - /* Check if we want to read from the client later when exiting from - * the event loop. This is the case if threaded I/O is enabled. */ - if (postponeClientRead(c)) return; - - /* Update total number of reads on server */ - atomic_fetch_add_explicit(&server.stat_total_reads_processed, 1, memory_order_relaxed); + int is_primary = c->read_flags & READ_FLAGS_PRIMARY; readlen = PROTO_IOBUF_LEN; qblen = c->querybuf ? sdslen(c->querybuf) : 0; @@ -2717,7 +3007,7 @@ void readQueryFromClient(connection *conn) { qblen = sdslen(c->querybuf); } - if (!c->flag.primary && // primary client's querybuf can grow greedy. + if (!is_primary && // primary client's querybuf can grow greedy. 
(big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) { /* When reading a BIG_ARG we won't be reading more than that one arg * into the query buffer, so we don't need to pre-allocate more than we @@ -2734,65 +3024,38 @@ void readQueryFromClient(connection *conn) { /* Read as much as possible from the socket to save read(2) system calls. */ readlen = sdsavail(c->querybuf); } - nread = connRead(c->conn, c->querybuf + qblen, readlen); - if (nread == -1) { - if (connGetState(conn) == CONN_STATE_CONNECTED) { - goto done; - } else { - serverLog(LL_VERBOSE, "Reading from client: %s", connGetLastError(c->conn)); - freeClientAsync(c); - goto done; - } - } else if (nread == 0) { - if (server.verbosity <= LL_VERBOSE) { - sds info = catClientInfoString(sdsempty(), c); - serverLog(LL_VERBOSE, "Client closed connection %s", info); - sdsfree(info); - } - freeClientAsync(c); - goto done; + c->nread = connRead(c->conn, c->querybuf + qblen, readlen); + if (c->nread <= 0) { + return; } - sdsIncrLen(c->querybuf, nread); + sdsIncrLen(c->querybuf, c->nread); qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; - - c->last_interaction = server.unixtime; - if (c->flag.primary) { - c->read_reploff += nread; - atomic_fetch_add_explicit(&server.stat_net_repl_input_bytes, nread, memory_order_relaxed); - } else { - atomic_fetch_add_explicit(&server.stat_net_input_bytes, nread, memory_order_relaxed); - } - c->net_input_bytes += nread; - - if (!c->flag.primary && + if (!is_primary) { /* The commands cached in the MULTI/EXEC queue have not been executed yet, * so they are also considered a part of the query buffer in a broader sense. * * For unauthenticated clients, the query buffer cannot exceed 1MB at most. 
*/ - (c->mstate.argv_len_sums + sdslen(c->querybuf) > server.client_max_querybuf_len || - (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024 * 1024 && authRequired(c)))) { - sds ci = catClientInfoString(sdsempty(), c), bytes = sdsempty(); - - bytes = sdscatrepr(bytes, c->querybuf, 64); - serverLog(LL_WARNING, "Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci, - bytes); - sdsfree(ci); - sdsfree(bytes); - freeClientAsync(c); - atomic_fetch_add_explicit(&server.stat_client_qbuf_limit_disconnections, 1, memory_order_relaxed); - goto done; + size_t qb_memory = sdslen(c->querybuf) + c->mstate.argv_len_sums; + if (qb_memory > server.client_max_querybuf_len || + (qb_memory > 1024 * 1024 && (c->read_flags & READ_FLAGS_AUTH_REQUIRED))) { + c->read_flags |= READ_FLAGS_QB_LIMIT_REACHED; + } } +} - /* There is more data in the client input buffer, continue parsing it - * and check if there is a full command to execute. */ - if (processInputBuffer(c) == C_ERR) c = NULL; +void readQueryFromClient(connection *conn) { + client *c = connGetPrivateData(conn); + /* Check if we can send the client to be handled by the IO-thread */ + if (postponeClientRead(c)) return; -done: - if (c && c->querybuf == thread_shared_qb) { - sdsclear(thread_shared_qb); - c->querybuf = NULL; + if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) return; + + readToQueryBuf(c); + + if (handleReadResult(c) == C_OK) { + if (processInputBuffer(c) == C_ERR) return; } beforeNextClient(c); } @@ -2849,6 +3112,7 @@ char *getClientSockname(client *c) { /* Concatenate a string representing the state of a client in a human * readable format, into the sds string 's'. 
*/ sds catClientInfoString(sds s, client *client) { + if (!server.crashed) waitForClientIO(client); char flags[17], events[3], conninfo[CONN_INFO_LEN], *p; p = flags; @@ -4056,7 +4320,7 @@ void flushReplicasOutputBuffers(void) { */ if (replica->repl_state == REPLICA_STATE_ONLINE && !(replica->flag.close_asap) && can_receive_writes && !replica->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) { - writeToClient(replica, 0); + writeToClient(replica); } } } @@ -4218,375 +4482,79 @@ void processEventsWhileBlocked(void) { server.cmd_time_snapshot = prev_cmd_time_snapshot; } -/* ========================================================================== - * Threaded I/O - * ========================================================================== */ - -typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) threads_pending { - _Atomic unsigned long value; -} threads_pending; - -pthread_t io_threads[IO_THREADS_MAX_NUM]; -pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; -threads_pending io_threads_pending[IO_THREADS_MAX_NUM]; -int io_threads_op; -/* IO_THREADS_OP_IDLE, IO_THREADS_OP_READ or IO_THREADS_OP_WRITE. */ // TODO: should access to this be atomic??! - -/* This is the list of clients each thread will serve when threaded I/O is - * used. We spawn io_threads_num-1 threads, since one is the main thread - * itself. */ -list *io_threads_list[IO_THREADS_MAX_NUM]; - -static inline unsigned long getIOPendingCount(int i) { - unsigned long count = atomic_load(&io_threads_pending[i].value); - return count; -} - -static inline void setIOPendingCount(int i, unsigned long count) { - atomic_store(&io_threads_pending[i].value, count); -} - -void *IOThreadMain(void *myid) { - /* The ID is the thread number (from 0 to server.io_threads_num-1), and is - * used by the thread to just manipulate a single sub-array of clients. 
*/ - long id = (unsigned long)myid; - char thdname[16]; - - snprintf(thdname, sizeof(thdname), "io_thd_%ld", id); - valkey_set_thread_title(thdname); - serverSetCpuAffinity(server.server_cpulist); - makeThreadKillable(); - initSharedQueryBuf(); - - while (1) { - /* Wait for start */ - for (int j = 0; j < 1000000; j++) { - if (getIOPendingCount(id) != 0) break; - } - - /* Give the main thread a chance to stop this thread. */ - if (getIOPendingCount(id) == 0) { - pthread_mutex_lock(&io_threads_mutex[id]); - pthread_mutex_unlock(&io_threads_mutex[id]); - continue; - } +/* Return 1 if the client read is handled using threaded I/O. + * 0 otherwise. */ +int postponeClientRead(client *c) { + if (ProcessingEventsWhileBlocked) return 0; - serverAssert(getIOPendingCount(id) != 0); - - /* Process: note that the main thread will never touch our list - * before we drop the pending count to 0. */ - listIter li; - listNode *ln; - listRewind(io_threads_list[id], &li); - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - if (io_threads_op == IO_THREADS_OP_WRITE) { - writeToClient(c, 0); - } else if (io_threads_op == IO_THREADS_OP_READ) { - readQueryFromClient(c->conn); - } else { - serverPanic("io_threads_op value is unknown"); - } - } - listEmpty(io_threads_list[id]); - setIOPendingCount(id, 0); - } + return (trySendReadToIOThreads(c) == C_OK); } -/* Initialize the data structures needed for threaded I/O. */ -void initThreadedIO(void) { - server.io_threads_active = 0; /* We start with threads not active. */ - - /* Indicate that io-threads are currently idle */ - io_threads_op = IO_THREADS_OP_IDLE; - - /* Don't spawn any thread if the user selected a single thread: - * we'll handle I/O directly from the main thread. 
*/ - if (server.io_threads_num == 1) return; +int processIOThreadsReadDone(void) { + if (listLength(server.clients_pending_io_read) == 0) return 0; + int processed = 0; + listNode *ln; - if (server.io_threads_num > IO_THREADS_MAX_NUM) { - serverLog(LL_WARNING, - "Fatal: too many I/O threads configured. " - "The maximum number is %d.", - IO_THREADS_MAX_NUM); - exit(1); - } - - /* Spawn and initialize the I/O threads. */ - for (int i = 0; i < server.io_threads_num; i++) { - /* Things we do for all the threads including the main thread. */ - io_threads_list[i] = listCreate(); - if (i == 0) continue; /* Thread 0 is the main thread. */ - - /* Things we do only for the additional threads. */ - pthread_t tid; - pthread_mutex_init(&io_threads_mutex[i], NULL); - setIOPendingCount(i, 0); - pthread_mutex_lock(&io_threads_mutex[i]); /* Thread will be stopped. */ - if (pthread_create(&tid, NULL, IOThreadMain, (void *)(long)i) != 0) { - serverLog(LL_WARNING, "Fatal: Can't initialize IO thread."); - exit(1); - } - io_threads[i] = tid; - } -} + listNode *next = listFirst(server.clients_pending_io_read); + while (next) { + ln = next; + next = listNextNode(ln); + client *c = listNodeValue(ln); -void killIOThreads(void) { - int err, j; - for (j = 0; j < server.io_threads_num; j++) { - if (io_threads[j] == pthread_self()) continue; - if (io_threads[j] && pthread_cancel(io_threads[j]) == 0) { - if ((err = pthread_join(io_threads[j], NULL)) != 0) { - serverLog(LL_WARNING, "IO thread(tid:%lu) can not be joined: %s", (unsigned long)io_threads[j], - strerror(err)); - } else { - serverLog(LL_WARNING, "IO thread(tid:%lu) terminated", (unsigned long)io_threads[j]); - } + /* Client is still waiting for a pending I/O - skip it */ + if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue; + /* If the write job is done, process it ASAP to free the buffer and handle connection errors */ + if (c->io_write_state == CLIENT_COMPLETED_IO) { + 
processClientIOWriteDone(c); } - } -} - -void startThreadedIO(void) { - serverAssert(server.io_threads_active == 0); - for (int j = 1; j < server.io_threads_num; j++) pthread_mutex_unlock(&io_threads_mutex[j]); - server.io_threads_active = 1; -} - -void stopThreadedIO(void) { - /* We may have still clients with pending reads when this function - * is called: handle them before stopping the threads. */ - handleClientsWithPendingReadsUsingThreads(); - serverAssert(server.io_threads_active == 1); - for (int j = 1; j < server.io_threads_num; j++) pthread_mutex_lock(&io_threads_mutex[j]); - server.io_threads_active = 0; -} - -/* This function checks if there are not enough pending clients to justify - * taking the I/O threads active: in that case I/O threads are stopped if - * currently active. We track the pending writes as a measure of clients - * we need to handle in parallel, however the I/O threading is disabled - * globally for reads as well if we have too little pending clients. - * - * The function returns 0 if the I/O threading should be used because there - * are enough active threads, otherwise 1 is returned and the I/O threads - * could be possibly stopped (if already active) as a side effect. */ -int stopThreadedIOIfNeeded(void) { - int pending = listLength(server.clients_pending_write); - - /* Return ASAP if IO threads are disabled (single threaded mode). */ - if (server.io_threads_num == 1) return 1; - - if (pending < (server.io_threads_num * 2)) { - if (server.io_threads_active) stopThreadedIO(); - return 1; - } else { - return 0; - } -} - -/* This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. 
*/ -int handleClientsWithPendingWritesUsingThreads(void) { - int processed = listLength(server.clients_pending_write); - if (processed == 0) return 0; /* Return ASAP if there are no clients. */ + /* memory barrier acquire to get the updated client state */ + atomic_thread_fence(memory_order_acquire); + /* Don't post-process-writes to clients that are going to be closed anyway. */ + if (c->flag.close_asap) continue; + /* If a client is protected, don't do anything, + * that may trigger read/write error or recreate handler. */ + if (c->flag.protected) continue; - /* If I/O threads are disabled or we have few clients to serve, don't - * use I/O threads, but the boring synchronous code. */ - if (server.io_threads_num == 1 || stopThreadedIOIfNeeded()) { - return handleClientsWithPendingWrites(); - } + listUnlinkNode(server.clients_pending_io_read, ln); + c->flag.pending_read = 0; + c->io_read_state = CLIENT_IDLE; - /* Start threads if needed. */ - if (!server.io_threads_active) startThreadedIO(); + processed++; + server.stat_io_reads_processed++; - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_write, &li); - int item_id = 0; - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - c->flag.pending_write = 0; + connSetPostponeUpdateState(c->conn, 0); + connUpdateState(c->conn); - /* Remove clients from the list of pending writes since - * they are going to be closed ASAP. */ - if (c->flag.close_asap) { - listUnlinkNode(server.clients_pending_write, ln); + /* On read error - stop here. */ + if (handleReadResult(c) == C_ERR) { continue; } - /* Since all replicas and replication backlog use global replication - * buffer, to guarantee data accessing thread safe, we must put all - * replicas client into io_threads_list[0] i.e. main thread handles - * sending the output buffer of all replicas. 
*/ - if (getClientType(c) == CLIENT_TYPE_REPLICA) { - listAddNodeTail(io_threads_list[0], c); - continue; + if (!(c->read_flags & READ_FLAGS_DONT_PARSE)) { + parseResult res = handleParseResults(c); + /* On parse error - stop here. */ + if (res == PARSE_ERR) { + continue; + } else if (res == PARSE_NEEDMORE) { + beforeNextClient(c); + continue; + } } - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id], c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_WRITE; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0], &li); - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - writeToClient(c, 0); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while (1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to install the write handler where - * needed. */ - listRewind(server.clients_pending_write, &li); - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - - /* Update the client in the mem usage after we're done processing it in the io-threads */ - updateClientMemUsageAndBucket(c); - - /* Install the write handler if there are pending writes in some - * of the clients. 
*/ - if (clientHasPendingReplies(c)) { - installClientWriteHandler(c); + if (c->argc > 0) { + c->flag.pending_command = 1; } - } - while (listLength(server.clients_pending_write) > 0) { - listUnlinkNode(server.clients_pending_write, server.clients_pending_write->head); - } - - /* Update processed count on server */ - server.stat_io_writes_processed += processed; - - return processed; -} - -/* Return 1 if we want to handle the client read later using threaded I/O. - * This is called by the readable handler of the event loop. - * As a side effect of calling this function the client is put in the - * pending read clients and flagged as such. */ -int postponeClientRead(client *c) { - if (server.io_threads_active && server.io_threads_do_reads && !ProcessingEventsWhileBlocked && - !(c->flag.primary || c->flag.replica || c->flag.blocked) && io_threads_op == IO_THREADS_OP_IDLE) { - listAddNodeHead(server.clients_pending_read, c); - c->pending_read_list_node = listFirst(server.clients_pending_read); - return 1; - } else { - return 0; - } -} - -/* When threaded I/O is also enabled for the reading + parsing side, the - * readable handler will just put normal clients into a queue of clients to - * process (instead of serving them synchronously). This function runs - * the queue using the I/O threads, and process them in order to accumulate - * the reads in the buffers, and also parse the first command available - * rendering it in the client structures. - * This function achieves thread safety using a fan-out -> fan-in paradigm: - * Fan out: The main thread fans out work to the io-threads which block until - * setIOPendingCount() is called with a value larger than 0 by the main thread. - * Fan in: The main thread waits until getIOPendingCount() returns 0. Then - * it can safely perform post-processing and return to normal synchronous - * work. 
*/ -int handleClientsWithPendingReadsUsingThreads(void) { - if (!server.io_threads_active || !server.io_threads_do_reads) return 0; - int processed = listLength(server.clients_pending_read); - if (processed == 0) return 0; - - /* Distribute the clients across N different lists. */ - listIter li; - listNode *ln; - listRewind(server.clients_pending_read, &li); - int item_id = 0; - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - int target_id = item_id % server.io_threads_num; - listAddNodeTail(io_threads_list[target_id], c); - item_id++; - } - - /* Give the start condition to the waiting threads, by setting the - * start condition atomic var. */ - io_threads_op = IO_THREADS_OP_READ; - for (int j = 1; j < server.io_threads_num; j++) { - int count = listLength(io_threads_list[j]); - setIOPendingCount(j, count); - } - - /* Also use the main thread to process a slice of clients. */ - listRewind(io_threads_list[0], &li); - while ((ln = listNext(&li))) { - client *c = listNodeValue(ln); - readQueryFromClient(c->conn); - } - listEmpty(io_threads_list[0]); - - /* Wait for all the other threads to end their work. */ - while (1) { - unsigned long pending = 0; - for (int j = 1; j < server.io_threads_num; j++) pending += getIOPendingCount(j); - if (pending == 0) break; - } - - io_threads_op = IO_THREADS_OP_IDLE; - - /* Run the list of clients again to process the new buffers. */ - while (listLength(server.clients_pending_read)) { - ln = listFirst(server.clients_pending_read); - client *c = listNodeValue(ln); - listDelNode(server.clients_pending_read, ln); - c->pending_read_list_node = NULL; - - serverAssert(!c->flag.blocked); - if (beforeNextClient(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. 
*/ - continue; + size_t list_length_before_command_execute = listLength(server.clients_pending_io_read); + if (processPendingCommandAndInputBuffer(c) == C_OK) { + beforeNextClient(c); } - - /* Once io-threads are idle we can update the client in the mem usage */ - updateClientMemUsageAndBucket(c); - - if (processPendingCommandAndInputBuffer(c) == C_ERR) { - /* If the client is no longer valid, we avoid - * processing the client later. So we just go - * to the next. */ - continue; + if (list_length_before_command_execute != listLength(server.clients_pending_io_read)) { + /* A client was unlinked from the list, possibly making the next node invalid */ + next = listFirst(server.clients_pending_io_read); } - - /* We may have pending replies if a thread readQueryFromClient() produced - * replies and did not put the client in pending write queue (it can't). - */ - if (!c->flag.pending_write && clientHasPendingReplies(c)) putClientInPendingWriteQueue(c); } - /* Update processed count on server */ - server.stat_io_reads_processed += processed; - return processed; } @@ -4640,3 +4608,44 @@ void evictClients(void) { } } } + +/* IO threads functions */ + +void ioThreadReadQueryFromClient(void *data) { + client *c = data; + serverAssert(c->io_read_state == CLIENT_PENDING_IO); + + /* Read */ + readToQueryBuf(c); + + /* Check for read errors. */ + if (c->nread <= 0) { + goto done; + } + + /* Skip command parsing if the READ_FLAGS_DONT_PARSE flag is set.
*/ + if (c->read_flags & READ_FLAGS_DONT_PARSE) { + goto done; + } + + /* Handle QB limit */ + if (c->read_flags & READ_FLAGS_QB_LIMIT_REACHED) { + goto done; + } + + parseCommand(c); + +done: + trimClientQueryBuffer(c); + atomic_thread_fence(memory_order_release); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +void ioThreadWriteToClient(void *data) { + client *c = data; + serverAssert(c->io_write_state == CLIENT_PENDING_IO); + c->nwritten = 0; + _writeToClient(c); + atomic_thread_fence(memory_order_release); + c->io_write_state = CLIENT_COMPLETED_IO; +} diff --git a/src/rdb.c b/src/rdb.c index 8b1037ab93..f9ccd676fd 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2931,7 +2931,7 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { processModuleLoadingProgressEvent(0); } if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { - atomic_fetch_add_explicit(&server.stat_net_repl_input_bytes, len, memory_order_relaxed); + server.stat_net_repl_input_bytes += len; } } diff --git a/src/replication.c b/src/replication.c index 6779b4f1b4..21ccb0e92d 100644 --- a/src/replication.c +++ b/src/replication.c @@ -765,9 +765,11 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { } /* If we reached this point, we are able to perform a partial resync: - * 1) Set client state to make it a replica. - * 2) Inform the client we can continue with +CONTINUE - * 3) Send the backlog data (from the offset to the end) to the replica. */ + * 1) Make sure no IO operations are being performed before changing the client state. + * 2) Set client state to make it a replica. + * 3) Inform the client we can continue with +CONTINUE + * 4) Send the backlog data (from the offset to the end) to the replica. 
*/ + waitForClientIO(c); c->flag.replica = 1; c->repl_state = REPLICA_STATE_ONLINE; c->repl_ack_time = server.unixtime; @@ -1009,6 +1011,8 @@ void syncCommand(client *c) { c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; + /* Wait for any IO pending operation to finish before changing the client state */ + waitForClientIO(c); c->flag.replica = 1; listAddNodeTail(server.replicas, c); @@ -1377,7 +1381,7 @@ void sendBulkToReplica(connection *conn) { freeClient(replica); return; } - atomic_fetch_add_explicit(&server.stat_net_repl_output_bytes, nwritten, memory_order_relaxed); + server.stat_net_repl_output_bytes += nwritten; sdsrange(replica->replpreamble, nwritten, -1); if (sdslen(replica->replpreamble) == 0) { sdsfree(replica->replpreamble); @@ -1405,7 +1409,7 @@ void sendBulkToReplica(connection *conn) { return; } replica->repldboff += nwritten; - atomic_fetch_add_explicit(&server.stat_net_repl_output_bytes, nwritten, memory_order_relaxed); + server.stat_net_repl_output_bytes += nwritten; if (replica->repldboff == replica->repldbsize) { closeRepldbfd(replica); connSetWriteHandler(replica->conn, NULL); @@ -1447,7 +1451,7 @@ void rdbPipeWriteHandler(struct connection *conn) { return; } else { replica->repldboff += nwritten; - atomic_fetch_add_explicit(&server.stat_net_repl_output_bytes, nwritten, memory_order_relaxed); + server.stat_net_repl_output_bytes += nwritten; if (replica->repldboff < server.rdb_pipe_bufflen) { replica->repl_last_partial_write = server.unixtime; return; /* more data to write.. */ @@ -1520,7 +1524,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, /* Note: when use diskless replication, 'repldboff' is the offset * of 'rdb_pipe_buff' sent rather than the offset of entire RDB. 
*/ replica->repldboff = nwritten; - atomic_fetch_add_explicit(&server.stat_net_repl_output_bytes, nwritten, memory_order_relaxed); + server.stat_net_repl_output_bytes += nwritten; } /* If we were unable to write all the data to one of the replicas, * setup write handler (and disable pipe read handler, below) */ @@ -1831,7 +1835,7 @@ void readSyncBulkPayload(connection *conn) { } else { /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and * convert "\r\n" to '\0' so 1 byte is lost. */ - atomic_fetch_add_explicit(&server.stat_net_repl_input_bytes, nread + 1, memory_order_relaxed); + server.stat_net_repl_input_bytes += nread + 1; } if (buf[0] == '-') { @@ -1900,7 +1904,7 @@ void readSyncBulkPayload(connection *conn) { cancelReplicationHandshake(1); return; } - atomic_fetch_add_explicit(&server.stat_net_repl_input_bytes, nread, memory_order_relaxed); + server.stat_net_repl_input_bytes += nread; /* When a mark is used, we want to detect EOF asap in order to avoid * writing the EOF mark into the file... */ diff --git a/src/server.c b/src/server.c index 57456c6597..465aa29391 100644 --- a/src/server.c +++ b/src/server.c @@ -39,6 +39,7 @@ #include "syscheck.h" #include "threads_mngr.h" #include "fmtargs.h" +#include "io_threads.h" #include #include @@ -754,6 +755,8 @@ int clientsCronResizeQueryBuffer(client *c) { * The buffer peak will be reset back to the buffer position every server.reply_buffer_peak_reset_time milliseconds * The function always returns 0 as it never terminates the client. */ int clientsCronResizeOutputBuffer(client *c, mstime_t now_ms) { + if (c->io_write_state != CLIENT_IDLE) return 0; + size_t new_buffer_size = 0; char *oldbuf = NULL; const size_t buffer_target_shrink_size = c->buf_usable_size / 2; @@ -904,7 +907,6 @@ void removeClientFromMemUsageBucket(client *c, int allow_eviction) { * returns 1 if client eviction for this client is allowed, 0 otherwise. 
*/ int updateClientMemUsageAndBucket(client *c) { - serverAssert(io_threads_op == IO_THREADS_OP_IDLE && c->conn); int allow_eviction = clientEvictionAllowed(c); removeClientFromMemUsageBucket(c, allow_eviction); @@ -997,6 +999,7 @@ void clientsCron(void) { head = listFirst(server.clients); c = listNodeValue(head); listRotateHeadToTail(server.clients); + if (c->io_read_state != CLIENT_IDLE || c->io_write_state != CLIENT_IDLE) continue; /* The following functions do different service checks on the client. * The protocol is that they return non-zero if the client was * terminated. */ @@ -1075,8 +1078,7 @@ void databasesCron(void) { static inline void updateCachedTimeWithUs(int update_daylight_info, const long long ustime) { server.ustime = ustime; server.mstime = server.ustime / 1000; - time_t unixtime = server.mstime / 1000; - atomic_store_explicit(&server.unixtime, unixtime, memory_order_relaxed); + server.unixtime = server.mstime / 1000; /* To get information about daylight saving time, we need to call * localtime_r and cache the result. 
However calling localtime_r in this @@ -1257,23 +1259,18 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { monotime cron_start = getMonotonicUs(); run_with_period(100) { - long long stat_net_input_bytes, stat_net_output_bytes; - long long stat_net_repl_input_bytes, stat_net_repl_output_bytes; - - stat_net_input_bytes = atomic_load_explicit(&server.stat_net_input_bytes, memory_order_relaxed); - stat_net_output_bytes = atomic_load_explicit(&server.stat_net_output_bytes, memory_order_relaxed); - stat_net_repl_input_bytes = atomic_load_explicit(&server.stat_net_repl_input_bytes, memory_order_relaxed); - stat_net_repl_output_bytes = atomic_load_explicit(&server.stat_net_repl_output_bytes, memory_order_relaxed); - monotime current_time = getMonotonicUs(); long long factor = 1000000; // us trackInstantaneousMetric(STATS_METRIC_COMMAND, server.stat_numcommands, current_time, factor); - trackInstantaneousMetric(STATS_METRIC_NET_INPUT, stat_net_input_bytes + stat_net_repl_input_bytes, current_time, - factor); - trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT, stat_net_output_bytes + stat_net_repl_output_bytes, + trackInstantaneousMetric(STATS_METRIC_NET_INPUT, server.stat_net_input_bytes + server.stat_net_repl_input_bytes, current_time, factor); - trackInstantaneousMetric(STATS_METRIC_NET_INPUT_REPLICATION, stat_net_repl_input_bytes, current_time, factor); - trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT_REPLICATION, stat_net_repl_output_bytes, current_time, factor); + trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT, + server.stat_net_output_bytes + server.stat_net_repl_output_bytes, current_time, + factor); + trackInstantaneousMetric(STATS_METRIC_NET_INPUT_REPLICATION, server.stat_net_repl_input_bytes, current_time, + factor); + trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT_REPLICATION, server.stat_net_repl_output_bytes, current_time, + factor); trackInstantaneousMetric(STATS_METRIC_EL_CYCLE, 
server.duration_stats[EL_DURATION_TYPE_EL].cnt, current_time, factor); trackInstantaneousMetric(STATS_METRIC_EL_DURATION, server.duration_stats[EL_DURATION_TYPE_EL].sum, @@ -1433,9 +1430,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { migrateCloseTimedoutSockets(); } - /* Stop the I/O threads if we don't have enough pending work. */ - stopThreadedIOIfNeeded(); - /* Resize tracking keys table if needed. This is also done at every * command execution, but we want to be sure that if the last command * executed changes the value via CONFIG SET, the server will perform @@ -1580,23 +1574,31 @@ void beforeSleep(struct aeEventLoop *eventLoop) { * events to handle. */ if (ProcessingEventsWhileBlocked) { uint64_t processed = 0; - processed += handleClientsWithPendingReadsUsingThreads(); + processed += processIOThreadsReadDone(); processed += connTypeProcessPendingData(); if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); processed += handleClientsWithPendingWrites(); + int last_procssed = 0; + do { + /* Try to process all the pending IO events. */ + last_procssed = processIOThreadsReadDone() + processIOThreadsWriteDone(); + processed += last_procssed; + } while (last_procssed != 0); processed += freeClientsInAsyncFreeQueue(); server.events_processed_while_blocked += processed; return; } /* We should handle pending reads clients ASAP after event loop. */ - handleClientsWithPendingReadsUsingThreads(); + processIOThreadsReadDone(); /* Handle pending data(typical TLS). (must be done before flushAppendOnlyFile) */ connTypeProcessPendingData(); - /* If any connection type(typical TLS) still has pending unread data don't sleep at all. */ - int dont_sleep = connTypeHasPendingData(); + /* If any connection type(typical TLS) still has pending unread data or if there are clients + * with pending IO reads/writes, don't sleep at all. 
*/ + int dont_sleep = connTypeHasPendingData() || listLength(server.clients_pending_io_read) > 0 || + listLength(server.clients_pending_io_write) > 0; /* Call the Cluster before sleep function. Note that this function * may change the state of Cluster (from ok to fail or vice versa), @@ -1659,7 +1661,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { long long prev_fsynced_reploff = server.fsynced_reploff; /* Write the AOF buffer on disk, - * must be done before handleClientsWithPendingWritesUsingThreads, + * must be done before handleClientsWithPendingWrites, * in case of appendfsync=always. */ if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); @@ -1679,7 +1681,14 @@ void beforeSleep(struct aeEventLoop *eventLoop) { } /* Handle writes with pending output buffers. */ - handleClientsWithPendingWritesUsingThreads(); + handleClientsWithPendingWrites(); + + /* Try to process more IO reads that are ready to be processed. */ + if (server.aof_fsync != AOF_FSYNC_ALWAYS) { + processIOThreadsReadDone(); + } + + processIOThreadsWriteDone(); /* Record cron time in beforeSleep. This does not include the time consumed by AOF writing and IO writing above. */ monotime cron_start_time_after_write = getMonotonicUs(); @@ -1729,7 +1738,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* This function is called immediately after the event loop multiplexing * API returned, and the control is going to soon return to the server by invoking * the different events callbacks. */ -void afterSleep(struct aeEventLoop *eventLoop) { +void afterSleep(struct aeEventLoop *eventLoop, int numevents) { UNUSED(eventLoop); /********************* WARNING ******************** * Do NOT add anything above moduleAcquireGIL !!! 
* @@ -1761,6 +1770,8 @@ void afterSleep(struct aeEventLoop *eventLoop) { if (!ProcessingEventsWhileBlocked) { server.cmd_time_snapshot = server.mstime; } + + adjustIOThreadsByEventLoad(numevents, 0); } /* =========================== Server initialization ======================== */ @@ -2478,10 +2489,10 @@ void resetServerStats(void) { server.stat_sync_partial_ok = 0; server.stat_sync_partial_err = 0; server.stat_io_reads_processed = 0; - atomic_store_explicit(&server.stat_total_reads_processed, 0, memory_order_relaxed); + server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; - atomic_store_explicit(&server.stat_total_writes_processed, 0, memory_order_relaxed); - atomic_store_explicit(&server.stat_client_qbuf_limit_disconnections, 0, memory_order_relaxed); + server.stat_total_writes_processed = 0; + server.stat_client_qbuf_limit_disconnections = 0; server.stat_client_outbuf_limit_disconnections = 0; for (j = 0; j < STATS_METRIC_COUNT; j++) { server.inst_metric[j].idx = 0; @@ -2492,10 +2503,10 @@ void resetServerStats(void) { server.stat_aof_rewrites = 0; server.stat_rdb_saves = 0; server.stat_aofrw_consecutive_failures = 0; - atomic_store_explicit(&server.stat_net_input_bytes, 0, memory_order_relaxed); - atomic_store_explicit(&server.stat_net_output_bytes, 0, memory_order_relaxed); - atomic_store_explicit(&server.stat_net_repl_input_bytes, 0, memory_order_relaxed); - atomic_store_explicit(&server.stat_net_repl_output_bytes, 0, memory_order_relaxed); + server.stat_net_input_bytes = 0; + server.stat_net_output_bytes = 0; + server.stat_net_repl_input_bytes = 0; + server.stat_net_repl_output_bytes = 0; server.stat_unexpected_error_replies = 0; server.stat_total_error_replies = 0; server.stat_dump_payload_sanitizations = 0; @@ -2545,7 +2556,8 @@ void initServer(void) { server.replicas = listCreate(); server.monitors = listCreate(); server.clients_pending_write = listCreate(); - server.clients_pending_read = listCreate(); + 
server.clients_pending_io_write = listCreate(); + server.clients_pending_io_read = listCreate(); server.clients_timeout_table = raxNew(); server.replication_allowed = 1; server.replicas_eldb = -1; /* Force to emit the first SELECT command. */ @@ -2641,6 +2653,7 @@ void initServer(void) { server.rdb_last_load_keys_expired = 0; server.rdb_last_load_keys_loaded = 0; server.dirty = 0; + server.crashed = 0; resetServerStats(); /* A few stats we don't want to reset: server startup time, and peak mem. */ server.stat_starttime = time(NULL); @@ -2796,7 +2809,7 @@ void initListeners(void) { * see: https://sourceware.org/bugzilla/show_bug.cgi?id=19329 */ void InitServerLast(void) { bioInit(); - initThreadedIO(); + initIOThreads(); set_jemalloc_bg_thread(server.jemalloc_bg_thread); server.initial_memory_usage = zmalloc_used_memory(); } @@ -5395,7 +5408,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "lru_clock:%u\r\n", server.lruclock, "executable:%s\r\n", server.executable ? server.executable : "", "config_file:%s\r\n", server.configfile ? server.configfile : "", - "io_threads_active:%i\r\n", server.io_threads_active, + "io_threads_active:%i\r\n", server.active_io_threads_num > 1, "availability_zone:%s\r\n", server.availability_zone)); /* clang-format on */ @@ -5630,23 +5643,10 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { /* Stats */ if (all_sections || (dictFind(section_dict, "stats") != NULL)) { - long long stat_total_reads_processed, stat_total_writes_processed; - long long stat_net_input_bytes, stat_net_output_bytes; - long long stat_net_repl_input_bytes, stat_net_repl_output_bytes; long long current_eviction_exceeded_time = server.stat_last_eviction_exceeded_time ? (long long)elapsedUs(server.stat_last_eviction_exceeded_time) : 0; long long current_active_defrag_time = server.stat_last_active_defrag_time ? 
(long long)elapsedUs(server.stat_last_active_defrag_time) : 0; - long long stat_client_qbuf_limit_disconnections; - - stat_total_reads_processed = atomic_load_explicit(&server.stat_total_reads_processed, memory_order_relaxed); - stat_total_writes_processed = atomic_load_explicit(&server.stat_total_writes_processed, memory_order_relaxed); - stat_net_input_bytes = atomic_load_explicit(&server.stat_net_input_bytes, memory_order_relaxed); - stat_net_output_bytes = atomic_load_explicit(&server.stat_net_output_bytes, memory_order_relaxed); - stat_net_repl_input_bytes = atomic_load_explicit(&server.stat_net_repl_input_bytes, memory_order_relaxed); - stat_net_repl_output_bytes = atomic_load_explicit(&server.stat_net_repl_output_bytes, memory_order_relaxed); - stat_client_qbuf_limit_disconnections = - atomic_load_explicit(&server.stat_client_qbuf_limit_disconnections, memory_order_relaxed); if (sections++) info = sdscat(info, "\r\n"); /* clang-format off */ @@ -5654,10 +5654,10 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "total_connections_received:%lld\r\n", server.stat_numconnections, "total_commands_processed:%lld\r\n", server.stat_numcommands, "instantaneous_ops_per_sec:%lld\r\n", getInstantaneousMetric(STATS_METRIC_COMMAND), - "total_net_input_bytes:%lld\r\n", stat_net_input_bytes + stat_net_repl_input_bytes, - "total_net_output_bytes:%lld\r\n", stat_net_output_bytes + stat_net_repl_output_bytes, - "total_net_repl_input_bytes:%lld\r\n", stat_net_repl_input_bytes, - "total_net_repl_output_bytes:%lld\r\n", stat_net_repl_output_bytes, + "total_net_input_bytes:%lld\r\n", server.stat_net_input_bytes + server.stat_net_repl_input_bytes, + "total_net_output_bytes:%lld\r\n", server.stat_net_output_bytes + server.stat_net_repl_output_bytes, + "total_net_repl_input_bytes:%lld\r\n", server.stat_net_repl_input_bytes, + "total_net_repl_output_bytes:%lld\r\n", server.stat_net_repl_output_bytes, "instantaneous_input_kbps:%.2f\r\n", 
(float)getInstantaneousMetric(STATS_METRIC_NET_INPUT)/1024, "instantaneous_output_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_OUTPUT)/1024, "instantaneous_input_repl_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_INPUT_REPLICATION)/1024, @@ -5696,11 +5696,11 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "unexpected_error_replies:%lld\r\n", server.stat_unexpected_error_replies, "total_error_replies:%lld\r\n", server.stat_total_error_replies, "dump_payload_sanitizations:%lld\r\n", server.stat_dump_payload_sanitizations, - "total_reads_processed:%lld\r\n", stat_total_reads_processed, - "total_writes_processed:%lld\r\n", stat_total_writes_processed, + "total_reads_processed:%lld\r\n", server.stat_total_reads_processed, + "total_writes_processed:%lld\r\n", server.stat_total_writes_processed, "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, - "client_query_buffer_limit_disconnections:%lld\r\n", stat_client_qbuf_limit_disconnections, + "client_query_buffer_limit_disconnections:%lld\r\n", server.stat_client_qbuf_limit_disconnections, "client_output_buffer_limit_disconnections:%lld\r\n", server.stat_client_outbuf_limit_disconnections, "reply_buffer_shrinks:%lld\r\n", server.stat_reply_buffer_shrinks, "reply_buffer_expands:%lld\r\n", server.stat_reply_buffer_expands, diff --git a/src/server.h b/src/server.h index 73f68a73d4..36a4b641e7 100644 --- a/src/server.h +++ b/src/server.h @@ -1117,6 +1117,12 @@ typedef struct { } clientReqResInfo; #endif +typedef enum { + CLIENT_IDLE = 0, /* Initial state: client is idle. */ + CLIENT_PENDING_IO = 1, /* Main-thread sets this state when client is sent to IO-thread for read/write. */ + CLIENT_COMPLETED_IO = 2 /* IO-thread sets this state after completing IO operation. 
*/ +} clientIOState; + typedef struct ClientFlags { uint64_t primary : 1; /* This client is a primary */ uint64_t replica : 1; /* This client is a replica */ @@ -1141,6 +1147,7 @@ typedef struct ClientFlags { uint64_t prevent_repl_prop : 1; /* Don't propagate to replicas. */ uint64_t prevent_prop : 1; /* Don't propagate to AOF or replicas. */ uint64_t pending_write : 1; /* Client has output to send but a write handler is yet not installed. */ + uint64_t pending_read : 1; /* Client is linked in the clients_pending_io_read list (pending IO-thread read). */ uint64_t reply_off : 1; /* Don't send replies to client. */ uint64_t reply_skip_next : 1; /* Set CLIENT_REPLY_SKIP for next cmd */ uint64_t reply_skip : 1; /* Don't send just this reply. */ @@ -1173,7 +1180,7 @@ typedef struct ClientFlags { uint64_t reprocessing_command : 1; /* The client is re-processing the command. */ uint64_t replication_done : 1; /* Indicate that replication has been done on the client */ uint64_t authenticated : 1; /* Indicate a client has successfully authenticated */ - uint64_t reserved : 10; /* Reserved for future use */ + uint64_t reserved : 9; /* Reserved for future use */ } ClientFlags; typedef struct client { @@ -1198,6 +1205,13 @@ typedef struct client { int original_argc; /* Num of arguments of original command if arguments were rewritten. */ robj **original_argv; /* Arguments of original command if arguments were rewritten. */ size_t argv_len_sum; /* Sum of lengths of objects in argv list. */ + volatile uint8_t io_read_state; /* Indicate the IO read state of the client */ + volatile uint8_t io_write_state; /* Indicate the IO write state of the client */ + uint8_t cur_tid; /* ID of IO thread currently performing IO for this client */ + int nread; /* Number of bytes of the last read. */ + int nwritten; /* Number of bytes of the last write. */ + int read_flags; /* Client Read flags - used to communicate the client read state.
*/ + uint16_t write_flags; /* Client Write flags - used to communicate the client write state. */ struct serverCommand *cmd, *lastcmd; /* Last command executed. */ struct serverCommand *realcmd; /* The original command that was executed by the client, Used to update error stats in case the c->cmd was modified @@ -1209,6 +1223,7 @@ typedef struct client { int multibulklen; /* Number of multi bulk arguments left to read. */ long bulklen; /* Length of bulk argument in multi bulk request. */ list *reply; /* List of reply objects to send to the client. */ + listNode *io_last_reply_block; /* Last client reply block when sent to IO thread */ unsigned long long reply_bytes; /* Tot bytes of objects in reply list. */ list *deferred_reply_errors; /* Used for module thread safe contexts. */ size_t sentlen; /* Amount of bytes already sent in the current @@ -1253,7 +1268,6 @@ typedef struct client { sds sockname; /* Cached connection target address. */ listNode *client_list_node; /* list node in client list */ listNode *postponed_list_node; /* list node within the postponed list */ - listNode *pending_read_list_node; /* list node in clients pending read list */ void *module_blocked_client; /* Pointer to the ValkeyModuleBlockedClient associated with this * client. This is set in case of module authentication before the * unblocked client is reprocessed to handle reply callbacks. */ @@ -1293,12 +1307,14 @@ typedef struct client { size_t ref_block_pos; /* Access position of referenced buffer block, * i.e. the next offset to send. */ - /* list node in clients_pending_write list */ + /* list node in clients_pending_write or in clients_pending_io_write list */ listNode clients_pending_write_node; + listNode pending_read_list_node; /* list node in clients_pending_io_read list */ /* Response buffer */ size_t buf_peak; /* Peak used size of buffer in last 5 sec interval. 
*/ mstime_t buf_peak_last_reset_time; /* keeps the last time the buffer peak value was reset */ int bufpos; + size_t io_last_bufpos; /* The client's bufpos at the time it was sent to the IO thread */ size_t buf_usable_size; /* Usable size of buffer. */ char *buf; #ifdef LOG_REQ_RES @@ -1629,7 +1645,8 @@ struct valkeyServer { list *clients; /* List of active clients */ list *clients_to_close; /* Clients to close asynchronously */ list *clients_pending_write; /* There is to write or install handler. */ - list *clients_pending_read; /* Client has pending read socket buffers. */ + list *clients_pending_io_read; /* List of clients with pending read to be processed by I/O threads. */ + list *clients_pending_io_write; /* List of clients with pending write to be processed by I/O threads. */ list *replicas, *monitors; /* List of replicas and MONITORs */ client *current_client; /* The client that triggered the command execution (External or AOF). */ client *executing_client; /* The client executing the current command (possibly script or module). */ @@ -1657,7 +1674,8 @@ struct valkeyServer { int protected_mode; /* Don't accept external connections. */ int io_threads_num; /* Number of IO threads to use. */ int io_threads_do_reads; /* Read and parse from IO threads? */ - int io_threads_active; /* Is IO threads currently active? */ + int active_io_threads_num; /* Current number of active IO threads, includes main thread. */ + int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation.
*/ long long events_processed_while_blocked; /* processEventsWhileBlocked() */ int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ @@ -1710,15 +1728,14 @@ struct valkeyServer { long long slowlog_log_slower_than; /* SLOWLOG time limit (to get logged) */ unsigned long slowlog_max_len; /* SLOWLOG max number of items logged */ struct malloc_stats cron_malloc_stats; /* sampled in serverCron(). */ - _Atomic long long stat_net_input_bytes; /* Bytes read from network. */ - _Atomic long long stat_net_output_bytes; /* Bytes written to network. */ - _Atomic long long - stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ - _Atomic long long - stat_net_repl_output_bytes; /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ - size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ - size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ - monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ + long long stat_net_input_bytes; /* Bytes read from network. */ + long long stat_net_output_bytes; /* Bytes written to network. */ + long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ + long long stat_net_repl_output_bytes; + size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ + size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ + monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ size_t stat_current_save_keys_processed; /* Processed keys while child is active. */ size_t stat_current_save_keys_total; /* Number of keys when child started. 
*/ size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */ @@ -1730,12 +1747,12 @@ struct valkeyServer { long long stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to primary, etc.) error replies */ long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ - long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */ - long long stat_io_reads_processed; /* Number of read events processed by IO / Main threads */ - long long stat_io_writes_processed; /* Number of write events processed by IO / Main threads */ - _Atomic long long stat_total_reads_processed; /* Total number of read events processed */ - _Atomic long long stat_total_writes_processed; /* Total number of write events processed */ - _Atomic long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ + long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */ + long long stat_io_reads_processed; /* Number of read events processed by IO threads */ + long long stat_io_writes_processed; /* Number of write events processed by IO threads */ + long long stat_total_reads_processed; /* Total number of read events processed */ + long long stat_total_writes_processed; /* Total number of write events processed */ + long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ long long stat_client_outbuf_limit_disconnections; /* Total number of clients reached output buf length limit */ /* The following two are used to track instantaneous metrics, like * number of operations per second, network traffic. */ @@ -1881,6 +1898,8 @@ struct valkeyServer { int syslog_facility; /* Syslog facility */ int crashlog_enabled; /* Enable signal handler for crashlog. * disable for clean core dumps. 
*/ + int crashed; /* True if the server has crashed, used in catClientInfoString + * to indicate that no wait for IO threads is needed. */ int memcheck_enabled; /* Enable memory check on crash. */ int use_exit_on_panic; /* Use exit() on panic and assert rather than * abort(). useful for Valgrind. */ @@ -2002,7 +2021,7 @@ struct valkeyServer { int list_max_listpack_size; int list_compress_depth; /* time cache */ - _Atomic time_t unixtime; /* Unix time sampled every cron cycle. */ + time_t unixtime; /* Unix time sampled every cron cycle. */ time_t timezone; /* Cached timezone. As set by tzset(). */ int daylight_active; /* Currently in daylight saving time. */ mstime_t mstime; /* 'unixtime' in milliseconds. */ @@ -2491,11 +2510,6 @@ typedef struct { #define OBJ_HASH_KEY 1 #define OBJ_HASH_VALUE 2 -#define IO_THREADS_OP_IDLE 0 -#define IO_THREADS_OP_READ 1 -#define IO_THREADS_OP_WRITE 2 -extern int io_threads_op; - /*----------------------------------------------------------------------------- * Extern declarations *----------------------------------------------------------------------------*/ @@ -2601,11 +2615,35 @@ void dictVanillaFree(dict *d, void *val); (1ULL << 0) /* Indicating that we should not update \ error stats after sending error reply */ /* networking.c -- Networking and Client related operations */ + +/* Read flags for various read errors and states */ +#define READ_FLAGS_QB_LIMIT_REACHED (1 << 0) +#define READ_FLAGS_ERROR_BIG_INLINE_REQUEST (1 << 1) +#define READ_FLAGS_ERROR_BIG_MULTIBULK (1 << 2) +#define READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN (1 << 3) +#define READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN (1 << 4) +#define READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN (1 << 5) +#define READ_FLAGS_ERROR_BIG_BULK_COUNT (1 << 6) +#define READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER (1 << 7) +#define READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN (1 << 8) +#define READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY (1 << 9) +#define READ_FLAGS_ERROR_UNBALANCED_QUOTES (1 
<< 10) +#define READ_FLAGS_INLINE_ZERO_QUERY_LEN (1 << 11) +#define READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN (1 << 12) +#define READ_FLAGS_PARSING_COMPLETED (1 << 13) +#define READ_FLAGS_PRIMARY (1 << 14) +#define READ_FLAGS_DONT_PARSE (1 << 15) +#define READ_FLAGS_AUTH_REQUIRED (1 << 16) + +/* Write flags for various write errors and states */ +#define WRITE_FLAGS_WRITE_ERROR (1 << 0) + + client *createClient(connection *conn); void freeClient(client *c); void freeClientAsync(client *c); void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...); -int beforeNextClient(client *c); +void beforeNextClient(client *c); void clearClientConnectionState(client *c); void resetClient(client *c); void freeClientOriginalArgv(client *c); @@ -2698,24 +2736,28 @@ void whileBlockedCron(void); void blockingOperationStarts(void); void blockingOperationEnds(void); int handleClientsWithPendingWrites(void); -int handleClientsWithPendingWritesUsingThreads(void); -int handleClientsWithPendingReadsUsingThreads(void); -int stopThreadedIOIfNeeded(void); +void adjustThreadedIOIfNeeded(void); int clientHasPendingReplies(client *c); int updateClientMemUsageAndBucket(client *c); void removeClientFromMemUsageBucket(client *c, int allow_eviction); void unlinkClient(client *c); -int writeToClient(client *c, int handler_installed); +int writeToClient(client *c); void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); -void initThreadedIO(void); void initSharedQueryBuf(void); +void freeSharedQueryBuf(void); client *lookupClientByID(uint64_t id); int authRequired(client *c); void putClientInPendingWriteQueue(client *c); client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); +void waitForClientIO(client *c); +void ioThreadReadQueryFromClient(void *data); +void ioThreadWriteToClient(void *data); +int canParseCommand(client *c); +int processIOThreadsReadDone(void); +int processIOThreadsWriteDone(void); /* 
logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); @@ -3834,7 +3876,6 @@ void xorDigest(unsigned char *digest, const void *ptr, size_t len); sds catSubCommandFullname(const char *parent_name, const char *sub_name); void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand, const char *declared_name); void debugDelay(int usec); -void killIOThreads(void); void killThreads(void); void makeThreadKillable(void); void swapMainDbWithTempDb(serverDb *tempDb); diff --git a/src/socket.c b/src/socket.c index 5aa3606990..b2f8f1aaec 100644 --- a/src/socket.c +++ b/src/socket.c @@ -423,6 +423,8 @@ static ConnectionType CT_Socket = { /* pending data */ .has_pending_data = NULL, .process_pending_data = NULL, + .postpone_update_state = NULL, + .update_state = NULL, }; int connBlock(connection *conn) { diff --git a/src/tls.c b/src/tls.c index 2d4d6cd0ae..1913d876fa 100644 --- a/src/tls.c +++ b/src/tls.c @@ -442,6 +442,7 @@ typedef enum { WANT_READ = 1, WANT_WRITE } WantIOType; #define TLS_CONN_FLAG_READ_WANT_WRITE (1 << 0) #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) +#define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) typedef struct tls_connection { connection c; @@ -596,7 +597,34 @@ static void registerSSLEvent(tls_connection *conn, WantIOType want) { } } +static void postPoneUpdateSSLState(connection *conn_, int postpone) { + tls_connection *conn = (tls_connection *)conn_; + if (postpone) { + conn->flags |= TLS_CONN_FLAG_POSTPONE_UPDATE_STATE; + } else { + conn->flags &= ~TLS_CONN_FLAG_POSTPONE_UPDATE_STATE; + } +} + +static void updatePendingData(tls_connection *conn) { + if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) return; + + /* If SSL has pending data, already read from the socket, we're at risk of not calling the read handler again, make + * sure to add it to a list of pending connection that should be handled anyway. 
*/ + if (SSL_pending(conn->ssl) > 0) { + if (!conn->pending_list_node) { + listAddNodeTail(pending_list, conn); + conn->pending_list_node = listLast(pending_list); + } + } else if (conn->pending_list_node) { + listDelNode(pending_list, conn->pending_list_node); + conn->pending_list_node = NULL; + } +} + static void updateSSLEvent(tls_connection *conn) { + if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) return; + int mask = aeGetFileEvents(server.el, conn->c.fd); int need_read = conn->c.read_handler || (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ); int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE); @@ -610,6 +638,12 @@ static void updateSSLEvent(tls_connection *conn) { if (!need_write && (mask & AE_WRITABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); } +static void updateSSLState(connection *conn_) { + tls_connection *conn = (tls_connection *)conn_; + updateSSLEvent(conn); + updatePendingData(conn); +} + static void tlsHandleEvent(tls_connection *conn, int mask) { int ret, conn_error; @@ -711,19 +745,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { if (!callHandler((connection *)conn, conn->c.read_handler)) return; } - /* If SSL has pending that, already read from the socket, we're at - * risk of not calling the read handler again, make sure to add it - * to a list of pending connection that should be handled anyway. 
*/ - if ((mask & AE_READABLE)) { - if (SSL_pending(conn->ssl) > 0) { - if (!conn->pending_list_node) { - listAddNodeTail(pending_list, conn); - conn->pending_list_node = listLast(pending_list); - } - } else if (conn->pending_list_node) { - listDelNode(pending_list, conn->pending_list_node); - conn->pending_list_node = NULL; - } + if (mask & AE_READABLE) { + updatePendingData(conn); } break; @@ -1051,11 +1074,13 @@ static int tlsProcessPendingData(void) { listIter li; listNode *ln; - int processed = listLength(pending_list); + int processed = 0; listRewind(pending_list, &li); while ((ln = listNext(&li))) { tls_connection *conn = listNodeValue(ln); + if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) continue; tlsHandleEvent(conn, AE_READABLE); + processed++; } return processed; } @@ -1125,6 +1150,8 @@ static ConnectionType CT_TLS = { /* pending data */ .has_pending_data = tlsHasPendingData, .process_pending_data = tlsProcessPendingData, + .postpone_update_state = postPoneUpdateSSLState, + .update_state = updateSSLState, /* TLS specified methods */ .get_peer_cert = connTLSGetPeerCert, diff --git a/src/unix.c b/src/unix.c index ca38e83ed0..795b2db9f1 100644 --- a/src/unix.c +++ b/src/unix.c @@ -198,6 +198,8 @@ static ConnectionType CT_Unix = { /* pending data */ .has_pending_data = NULL, .process_pending_data = NULL, + .postpone_update_state = NULL, + .update_state = NULL, }; int RedisRegisterConnectionTypeUnix(void) { diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl index 70bb66284d..3049cd0ca0 100644 --- a/tests/integration/failover.tcl +++ b/tests/integration/failover.tcl @@ -257,6 +257,12 @@ start_server {overrides {save {}}} { # during the pause. This write will not be interrupted. 
pause_process [srv -1 pid] set rd [valkey_deferring_client] + # wait for the client creation + wait_for_condition 50 100 { + [s connected_clients] == 2 + } else { + fail "Client creation failed" + } $rd SET FOO BAR $node_0 failover to $node_1_host $node_1_port resume_process [srv -1 pid] diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index f56fe0a1dc..9634f78252 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -167,6 +167,7 @@ start_server {tags {"repl external:skip"}} { test {BLPOP followed by role change, issue #2473} { set rd [valkey_deferring_client] $rd blpop foo 0 ; # Block while B is a master + wait_for_blocked_clients_count 1 # Turn B into master of A $A slaveof no one diff --git a/tests/integration/shutdown.tcl b/tests/integration/shutdown.tcl index b2fdb845a3..9949afe27c 100644 --- a/tests/integration/shutdown.tcl +++ b/tests/integration/shutdown.tcl @@ -156,6 +156,12 @@ test "Shutting down master waits for replica then fails" { set rd2 [valkey_deferring_client -1] $rd1 shutdown $rd2 shutdown + wait_for_condition 50 100 { + [llength [lsearch -all [split [string trim [$master client list]] "\r\n"] *cmd=shutdown*]] == 2 + } else { + fail "SHUTDOWN not called on all clients" + } + set info_clients [$master info clients] assert_match "*connected_clients:3*" $info_clients assert_match "*blocked_clients:2*" $info_clients @@ -209,6 +215,12 @@ test "Shutting down master waits for replica then aborted" { set rd2 [valkey_deferring_client -1] $rd1 shutdown $rd2 shutdown + wait_for_condition 50 100 { + [llength [lsearch -all [split [string trim [$master client list]] "\r\n"] *cmd=shutdown*]] == 2 + } else { + fail "SHUTDOWN not called on all clients" + } + set info_clients [$master info clients] assert_match "*connected_clients:3*" $info_clients assert_match "*blocked_clients:2*" $info_clients diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index 
153c527055..6344215a25 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -65,6 +65,7 @@ start_server {tags {"cli"}} { proc run_command {fd cmd} { write_cli $fd $cmd + after 50 set _ [format_output [read_cli $fd]] } diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl index afcdcd1323..ceeb20f7b6 100644 --- a/tests/unit/client-eviction.tcl +++ b/tests/unit/client-eviction.tcl @@ -91,17 +91,31 @@ start_server {} { lassign [gen_client] rr cname # Attempt to fill the query buff with only half the percentage threshold verify we're not disconnected set n [expr $maxmemory_clients_actual / 2] - $rr write [join [list "*1\r\n\$$n\r\n" [string repeat v $n]] ""] + # send incomplete command (n - 1) to make sure we don't use the shared qb + $rr write [join [list "*1\r\n\$$n\r\n" [string repeat v [expr {$n - 1}]]] ""] $rr flush + # Wait for the client to start using a private query buffer. + wait_for_condition 10 10 { + [client_field $cname qbuf] > 0 + } else { + fail "client should start using a private query buffer" + } set tot_mem [client_field $cname tot-mem] assert {$tot_mem >= $n && $tot_mem < $maxmemory_clients_actual} # Attempt to fill the query buff with the percentage threshold of maxmemory and verify we're evicted $rr close lassign [gen_client] rr cname + # send incomplete command (maxmemory_clients_actual - 1) to make sure we don't use the shared qb catch { - $rr write [join [list "*1\r\n\$$maxmemory_clients_actual\r\n" [string repeat v $maxmemory_clients_actual]] ""] + $rr write [join [list "*1\r\n\$$maxmemory_clients_actual\r\n" [string repeat v [expr {$maxmemory_clients_actual - 1}]]] ""] $rr flush + # Wait for the client to start using a private query buffer. 
+ wait_for_condition 10 10 { + [client_field $cname qbuf] > 0 + } else { + fail "client should start using a private query buffer" + } } e assert {![client_exists $cname]} $rr close @@ -399,6 +413,11 @@ start_server {} { # Decrease maxmemory_clients and expect client eviction r config set maxmemory-clients [expr $maxmemory_clients / 2] + wait_for_condition 50 10 { + [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]] < $client_count + } else { + fail "Failed to evict clients" + } set connected_clients [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *name=client*]] assert {$connected_clients > 0 && $connected_clients < $client_count} diff --git a/tests/unit/cluster/pubsubshard.tcl b/tests/unit/cluster/pubsubshard.tcl index e32b6a3a0e..d38c22dedb 100644 --- a/tests/unit/cluster/pubsubshard.tcl +++ b/tests/unit/cluster/pubsubshard.tcl @@ -62,7 +62,13 @@ test "sunsubscribe without specifying any channel would unsubscribe all shard ch set sub_res [ssubscribe $subscribeclient [list "\{channel.0\}1" "\{channel.0\}2" "\{channel.0\}3"]] assert_equal [list 1 2 3] $sub_res sunsubscribe $subscribeclient - + + # wait for the unsubscribe to take effect + wait_for_condition 50 10 { + [$publishclient spublish "\{channel.0\}1" hello] eq 0 + } else { + fail "unsubscribe did not take effect as expected" + } assert_equal 0 [$publishclient spublish "\{channel.0\}1" hello] assert_equal 0 [$publishclient spublish "\{channel.0\}2" hello] assert_equal 0 [$publishclient spublish "\{channel.0\}3" hello] diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl index 9018270d67..e4c0f9d312 100644 --- a/tests/unit/dump.tcl +++ b/tests/unit/dump.tcl @@ -287,6 +287,7 @@ start_server {tags {"dump"}} { set rd [valkey_deferring_client] $rd debug sleep 1.0 ; # Make second server unable to reply. + after 100; # wait to make sure DEBUG command was executed. 
set e {} catch {r -1 migrate $second_host $second_port key 9 500} e assert_match {IOERR*} $e diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 17dc6a1861..befecae220 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -295,47 +295,50 @@ start_server {tags {"info" "external:skip"}} { } } - test {stats: eventloop metrics} { - set info1 [r info stats] - set cycle1 [getInfoProperty $info1 eventloop_cycles] - set el_sum1 [getInfoProperty $info1 eventloop_duration_sum] - set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum] - assert_morethan $cycle1 0 - assert_morethan $el_sum1 0 - assert_morethan $cmd_sum1 0 - after 110 ;# default hz is 10, wait for a cron tick. - set info2 [r info stats] - set cycle2 [getInfoProperty $info2 eventloop_cycles] - set el_sum2 [getInfoProperty $info2 eventloop_duration_sum] - set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum] - if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" } - assert_morethan $cycle2 $cycle1 - assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance - if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" } - assert_morethan $el_sum2 $el_sum1 - assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance - if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" } - assert_morethan $cmd_sum2 $cmd_sum1 - assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance - } - - test {stats: instantaneous metrics} { - r config resetstat - set retries 0 - for {set retries 1} {$retries < 4} {incr retries} { - after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled - set value [s instantaneous_eventloop_cycles_per_sec] - if {$value > 0} break + # skip the following 2 tests if we are running with io-threads as the eventloop metrics are different in that 
case. + if {[lindex [r config get io-threads] 1] == 1} { + test {stats: eventloop metrics} { + set info1 [r info stats] + set cycle1 [getInfoProperty $info1 eventloop_cycles] + set el_sum1 [getInfoProperty $info1 eventloop_duration_sum] + set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum] + assert_morethan $cycle1 0 + assert_morethan $el_sum1 0 + assert_morethan $cmd_sum1 0 + after 110 ;# default hz is 10, wait for a cron tick. + set info2 [r info stats] + set cycle2 [getInfoProperty $info2 eventloop_cycles] + set el_sum2 [getInfoProperty $info2 eventloop_duration_sum] + set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum] + if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" } + assert_morethan $cycle2 $cycle1 + assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance + if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" } + assert_morethan $el_sum2 $el_sum1 + assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance + if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" } + assert_morethan $cmd_sum2 $cmd_sum1 + assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance + } + + test {stats: instantaneous metrics} { + r config resetstat + set retries 0 + for {set retries 1} {$retries < 4} {incr retries} { + after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled + set value [s instantaneous_eventloop_cycles_per_sec] + if {$value > 0} break + } + + assert_lessthan $retries 4 + if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" } + assert_morethan $value 0 + assert_lessthan $value [expr $retries*15] ;# default hz is 10 + set value [s instantaneous_eventloop_duration_usec] + if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" } 
+ assert_morethan $value 0 + assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance } - - assert_lessthan $retries 4 - if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" } - assert_morethan $value 0 - assert_lessthan $value [expr $retries*15] ;# default hz is 10 - set value [s instantaneous_eventloop_duration_usec] - if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" } - assert_morethan $value 0 - assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance } test {stats: debug metrics} { diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index ee1232796d..66dae2546a 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -98,6 +98,7 @@ start_server {tags {"maxmemory" "external:skip"}} { $rr write "\r\n" $rr flush } + after 100; # give the server some time to process the input buffer - this was added to make sure the test pass with io-threads active. }]} { lremove clients $rr } diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 525db407bf..feb98d9cdd 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -404,6 +404,8 @@ run_solo {defrag} { r save ;# saving an rdb iterates over all the data / pointers } {OK} + # Skip the following two tests if we are running with IO threads, as the IO threads allocate the command arguments in a different arena. As a result, fragmentation is not as expected. 
+ if {[lindex [r config get io-threads] 1] == 1} { test "Active defrag pubsub: $type" { r flushdb r config resetstat @@ -502,6 +504,7 @@ run_solo {defrag} { } $rd_pubsub close } + } ;# io-threads if {$type eq "standalone"} { ;# skip in cluster mode test "Active defrag big list: $type" { diff --git a/tests/unit/moduleapi/blockedclient.tcl b/tests/unit/moduleapi/blockedclient.tcl index d94ef5c5ba..bb0a15db50 100644 --- a/tests/unit/moduleapi/blockedclient.tcl +++ b/tests/unit/moduleapi/blockedclient.tcl @@ -128,7 +128,7 @@ foreach call_type {nested normal} { # send another command after the blocked one, to make sure we don't attempt to process it $rd ping $rd flush - + after 100 # make sure we get BUSY error, and that we didn't get it too early assert_error {*BUSY Slow module operation*} {r ping} assert_morethan_equal [expr [clock clicks -milliseconds]-$start] $busy_time_limit diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 2f336dfcb9..72d0498ce1 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -85,6 +85,12 @@ start_server {tags {"pubsub network"}} { set rd1 [valkey_deferring_client] assert_equal {1 2 3} [subscribe $rd1 {chan1 chan2 chan3}] unsubscribe $rd1 + # wait for the unsubscribe to take effect + wait_for_condition 50 100 { + [r publish chan1 hello] eq 0 + } else { + fail "unsubscribe did not take effect" + } assert_equal 0 [r publish chan1 hello] assert_equal 0 [r publish chan2 hello] assert_equal 0 [r publish chan3 hello] @@ -158,6 +164,12 @@ start_server {tags {"pubsub network"}} { set rd1 [valkey_deferring_client] assert_equal {1 2 3} [psubscribe $rd1 {chan1.* chan2.* chan3.*}] punsubscribe $rd1 + # wait for the unsubscribe to take effect + wait_for_condition 50 100 { + [r publish chan1.hi hello] eq 0 + } else { + fail "unsubscribe did not take effect" + } assert_equal 0 [r publish chan1.hi hello] assert_equal 0 [r publish chan2.hi hello] assert_equal 0 [r publish chan3.hi hello] diff --git a/tests/unit/pubsubshard.tcl 
b/tests/unit/pubsubshard.tcl index d56f36ffaa..e19db211f7 100644 --- a/tests/unit/pubsubshard.tcl +++ b/tests/unit/pubsubshard.tcl @@ -46,6 +46,14 @@ start_server {tags {"pubsubshard external:skip"}} { assert_equal {2} [ssubscribe $rd1 {chan2}] assert_equal {3} [ssubscribe $rd1 {chan3}] sunsubscribe $rd1 + + # wait for the unsubscribe to take effect + wait_for_condition 50 100 { + [r spublish chan1 hello] eq 0 + } else { + fail "unsubscribe did not take effect" + } + assert_equal 0 [r SPUBLISH chan1 hello] assert_equal 0 [r SPUBLISH chan2 hello] assert_equal 0 [r SPUBLISH chan3 hello] diff --git a/tests/unit/querybuf.tcl b/tests/unit/querybuf.tcl index 519743d248..f0f432b38f 100644 --- a/tests/unit/querybuf.tcl +++ b/tests/unit/querybuf.tcl @@ -92,7 +92,7 @@ start_server {tags {"querybuf slow"}} { # Write something smaller, so query buf peak can shrink $rd set x [string repeat A 100] set new_test_client_qbuf [client_query_buffer test_client] - if {$new_test_client_qbuf < $orig_test_client_qbuf} { break } + if {$new_test_client_qbuf < $orig_test_client_qbuf && $new_test_client_qbuf > 0} { break } if {[expr [clock milliseconds] - $t] > 1000} { break } after 10 } diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl index e6c8bb331f..4773a58820 100644 --- a/tests/unit/type/list.tcl +++ b/tests/unit/type/list.tcl @@ -1100,6 +1100,13 @@ foreach {pop} {BLPOP BLMPOP_LEFT} { $watching_client get somekey{t} $watching_client read $watching_client exec + # wait for exec to be called. + wait_for_condition 50 10 { + [llength [lsearch -all [split [string trim [r client list]] "\r\n"] *cmd=exec*]] == 1 + } else { + fail "$cmd was not called" + } + # Blocked BLPOPLPUSH may create problems, unblock it. 
r lpush srclist{t} element set res [$watching_client read] diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index 2cd812e521..d934e48140 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -520,7 +520,7 @@ start_server { # Before the fix in #13004, this time would have been 1200+ (i.e. more than 1200ms), # now it should be 1000, but in order to avoid timing issues, we increase the range a bit. - assert_range [expr $end-$start] 1000 1150 + assert_range [expr $end-$start] 1000 1199 $rd1 close $rd2 close @@ -931,14 +931,14 @@ start_server { set reply [r xinfo consumers mystream mygroup] set consumer_info [lindex $reply 0] assert_equal [lindex $consumer_info 1] "Alice" ;# consumer name - assert {[dict get $consumer_info idle] < 80} ;# consumer idle (seen-time) - assert {[dict get $consumer_info inactive] < 80} ;# consumer inactive (active-time) + assert {[dict get $consumer_info idle] < 300} ;# consumer idle (seen-time) + assert {[dict get $consumer_info inactive] < 300} ;# consumer inactive (active-time) after 100 r XREADGROUP GROUP mygroup Alice COUNT 1 STREAMS mystream > set reply [r xinfo consumers mystream mygroup] set consumer_info [lindex $reply 0] - assert {[dict get $consumer_info idle] < 80} ;# consumer idle (seen-time) + assert {[dict get $consumer_info idle] < 300} ;# consumer idle (seen-time) assert {[dict get $consumer_info inactive] >= 100} ;# consumer inactive (active-time) @@ -1324,6 +1324,9 @@ start_server { assert_equal [dict get $group entries-read] 3 assert_equal [dict get $group lag] 0 + # wait for replica offset + wait_for_ofs_sync $master $replica + set reply [$replica XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] assert_equal [dict get $group entries-read] 3 diff --git a/tests/unit/type/zset.tcl b/tests/unit/type/zset.tcl index f6c643a5ef..b341bbf69d 100644 --- a/tests/unit/type/zset.tcl +++ b/tests/unit/type/zset.tcl @@ -2012,6 +2012,7 @@ 
start_server {tags {"zset"}} { # Before the fix in #13004, this time would have been 1200+ (i.e. more than 1200ms), # now it should be 1000, but in order to avoid timing issues, we increase the range a bit. assert_range [expr $end-$start] 1000 1150 + puts "Time: [expr $end-$start]" r debug set-active-expire 1 $rd close diff --git a/valkey.conf b/valkey.conf index e4ffd0f8ad..8badf1487a 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1288,9 +1288,8 @@ lazyfree-lazy-user-flush no # to pipelining nor sharding of the instance. # # By default threading is disabled, we suggest enabling it only in machines -# that have at least 4 or more cores, leaving at least one spare core. -# Using more than 8 threads is unlikely to help much. We also recommend using -# threaded I/O only if you actually have performance problems, with +# that have at least 3 or more cores, leaving at least one spare core. +# We also recommend using threaded I/O only if you actually have performance problems, with # instances being able to use a quite big percentage of CPU time, otherwise # there is no point in using this feature. # @@ -1301,19 +1300,9 @@ lazyfree-lazy-user-flush no # io-threads 4 # # Setting io-threads to 1 will just use the main thread as usual. -# When I/O threads are enabled, we only use threads for writes, that is -# to thread the write(2) syscall and transfer the client buffers to the -# socket. However it is also possible to enable threading of reads and -# protocol parsing using the following configuration directive, by setting -# it to yes: -# -# io-threads-do-reads no -# -# Usually threading reads doesn't help much. -# -# NOTE 1: This configuration directive cannot be changed at runtime via -# CONFIG SET. Also, this feature currently does not work when SSL is -# enabled. 
+# When I/O threads are enabled, they handle both reads and writes: the +# threads perform the read and write syscalls, move data between the client +# buffers and the socket, and parse the incoming protocol. # # NOTE 2: If you want to test the server speedup using valkey-benchmark, make # sure you also run the benchmark itself in threaded mode, using the