From b6e0f1998930a4a56175ef56f45d08095c62c14e Mon Sep 17 00:00:00 2001
From: Karthik Subbarao <karthikrs2021@gmail.com>
Date: Wed, 15 May 2024 08:43:36 +0000
Subject: [PATCH 1/6] Add support for limiting custom LUA errors when over 128
 error messages while allowing non LUA errors to function as usual

Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com>
---
 src/networking.c    | 28 +++++++++++++++++-----------
 src/script_lua.c    |  2 +-
 src/server.c        | 42 ------------------------------------------
 src/server.h        |  7 ++++++-
 tests/unit/info.tcl | 18 ++++++------------
 5 files changed, 30 insertions(+), 67 deletions(-)

diff --git a/src/networking.c b/src/networking.c
index 5aa02e8315..afb4ed9ca6 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -513,19 +513,25 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
     if (!(flags & ERR_REPLY_FLAG_NO_STATS_UPDATE)) {
         /* Increment the global error counter */
         server.stat_total_error_replies++;
-        /* Increment the error stats
-         * If the string already starts with "-..." then the error prefix
-         * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
-        if (s[0] != '-') {
-            incrementErrorCount("ERR", 3);
+        /* After the errors RAX reaches this limit, instead of tracking
+         * custom LUA errors, we track the error under `error_LUA` */
+        if (flags & ERR_REPLY_FLAG_LUA && raxSize(server.errors) >= ERROR_STATS_LUA_LIMIT) {
+            incrementErrorCount("LUA_ERRORSTATS_DISABLED", 23);
         } else {
-            char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
-            if (spaceloc) {
-                const size_t errEndPos = (size_t)(spaceloc - s);
-                incrementErrorCount(s+1, errEndPos-1);
-            } else {
-                /* Fallback to ERR if we can't retrieve the error prefix */
+            /* Increment the error stats
+            * If the string already starts with "-..." then the error prefix
+            * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
+            if (s[0] != '-') {
                 incrementErrorCount("ERR", 3);
+            } else {
+                char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
+                if (spaceloc) {
+                    const size_t errEndPos = (size_t)(spaceloc - s);
+                    incrementErrorCount(s+1, errEndPos-1);
+                } else {
+                    /* Fallback to ERR if we can't retrieve the error prefix */
+                    incrementErrorCount("ERR", 3);
+                }
             }
         }
     } else {
diff --git a/src/script_lua.c b/src/script_lua.c
index 21bf6f6af9..501116ae4f 100644
--- a/src/script_lua.c
+++ b/src/script_lua.c
@@ -639,7 +639,7 @@ static void luaReplyToRedisReply(client *c, client* script_client, lua_State *lu
             errorInfo err_info = {0};
             luaExtractErrorInformation(lua, &err_info);
             addReplyErrorFormatEx(c,
-                                  err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE: 0,
+                                  err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE | ERR_REPLY_FLAG_LUA: ERR_REPLY_FLAG_LUA,
                                   "-%s",
                                   err_info.msg);
             luaErrorInformationDiscard(&err_info);
diff --git a/src/server.c b/src/server.c
index fe54d9eb82..284e675e4a 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2629,7 +2629,6 @@ void initServer(void) {
     server.main_thread_id = pthread_self();
     server.current_client = NULL;
     server.errors = raxNew();
-    server.errors_enabled = 1;
     server.execution_nesting = 0;
     server.clients = listCreate();
     server.clients_index = raxNew();
@@ -3136,7 +3135,6 @@ void resetCommandTableStats(dict* commands) {
 void resetErrorTableStats(void) {
     freeErrorsRadixTreeAsync(server.errors);
     server.errors = raxNew();
-    server.errors_enabled = 1;
 }
 
 /* ========================== OP Array API ============================ */
@@ -4233,49 +4231,9 @@ int processCommand(client *c) {
 
 /* ====================== Error lookup and execution ===================== */
 
-/* Users who abuse lua error_reply will generate a new error object on each
- * error call, which can make server.errors get bigger and bigger. This will
- * cause the server to block when calling INFO (we also return errorstats by
- * default). To prevent the damage it can cause, when a misuse is detected,
- * we will print the warning log and disable the errorstats to avoid adding
- * more new errors. It can be re-enabled via CONFIG RESETSTAT. */
-#define ERROR_STATS_NUMBER 128
 void incrementErrorCount(const char *fullerr, size_t namelen) {
-    /* errorstats is disabled, return ASAP. */
-    if (!server.errors_enabled) return;
-
     void *result;
     if (!raxFind(server.errors,(unsigned char*)fullerr,namelen,&result)) {
-        if (server.errors->numele >= ERROR_STATS_NUMBER) {
-            sds errors = sdsempty();
-            raxIterator ri;
-            raxStart(&ri, server.errors);
-            raxSeek(&ri, "^", NULL, 0);
-            while (raxNext(&ri)) {
-                char *tmpsafe;
-                errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len);
-                errors = sdscatlen(errors, ", ", 2);
-                if (tmpsafe != NULL) zfree(tmpsafe);
-            }
-            sdsrange(errors, 0, -3); /* Remove final ", ". */
-            raxStop(&ri);
-
-            /* Print the warning log and the contents of server.errors to the log. */
-            serverLog(LL_WARNING,
-                      "Errorstats stopped adding new errors because the number of "
-                      "errors reached the limit, may be misuse of lua error_reply, "
-                      "please check INFO ERRORSTATS, this can be re-enabled via "
-                      "CONFIG RESETSTAT.");
-            serverLog(LL_WARNING, "Current errors code list: %s", errors);
-            sdsfree(errors);
-
-            /* Reset the errors and add a single element to indicate that it is disabled. */
-            resetErrorTableStats();
-            incrementErrorCount("ERRORSTATS_DISABLED", 19);
-            server.errors_enabled = 0;
-            return;
-        }
-
         struct serverError *error = zmalloc(sizeof(*error));
         error->count = 1;
         raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL);
diff --git a/src/server.h b/src/server.h
index 23bcda81d7..3222e33989 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1568,7 +1568,6 @@ struct valkeyServer {
     dict *orig_commands;        /* Command table before command renaming. */
     aeEventLoop *el;
     rax *errors;                /* Errors table */
-    int errors_enabled;         /* If true, errorstats is enabled, and we will add new errors. */
     unsigned int lruclock; /* Clock for LRU eviction */
     volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
     mstime_t shutdown_mstime;   /* Timestamp to limit graceful shutdown. */
@@ -2563,9 +2562,15 @@ int validateProcTitleTemplate(const char *template);
 int serverCommunicateSystemd(const char *sd_notify_msg);
 void serverSetCpuAffinity(const char *cpulist);
 
+/* ERROR STATS constants */
+#define ERROR_STATS_LUA_LIMIT 128 /* After the errors RAX reaches this limit, instead of tracking 
+                                    custom LUA errors, we track the error under `error_LUA`. */
+
 /* afterErrorReply flags */
 #define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL<<0) /* Indicating that we should not update
                                                     error stats after sending error reply */
+#define ERR_REPLY_FLAG_LUA (1ULL<<1) /* Indicating that we are coming in from ScriptCall */
+
 /* networking.c -- Networking and Client related operations */
 client *createClient(connection *conn);
 void freeClient(client *c);
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 17dc6a1861..dbe4d3606d 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -281,18 +281,12 @@ start_server {tags {"info" "external:skip"}} {
                     r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
                 }
             }
-
-            assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1
-            assert_equal [count_log_message 0 "Current errors code list"] 1
-            assert_equal "count=1" [errorstat ERRORSTATS_DISABLED]
-
-            # Since we currently have no metrics exposed for server.errors, we use lazyfree
-            # to verify that we only have 128 errors.
-            wait_for_condition 50 100 {
-                [s lazyfreed_objects] eq 128
-            } else {
-                fail "errorstats resetstat lazyfree error"
-            }
+            # Validate that custom LUA errors are tracked in `LUA_ERRORSTATS_DISABLED` when errors
+            # has 128 entries.
+            assert_equal "count=972" [errorstat LUA_ERRORSTATS_DISABLED]
+            # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
+            assert_error {ERR syntax error} {r set a b c d e f g}
+            assert_equal "count=1" [errorstat ERR]
         }
 
         test {stats: eventloop metrics} {

From c86cb624202009d4a9f4a6eaeb407c6fd355f72d Mon Sep 17 00:00:00 2001
From: Karthik Subbarao <karthikrs2021@gmail.com>
Date: Wed, 15 May 2024 09:19:34 +0000
Subject: [PATCH 2/6] Minor clean up + update documentation

Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com>
---
 src/networking.c | 24 +++++++++++-------------
 src/server.h     |  3 ++-
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/networking.c b/src/networking.c
index afb4ed9ca6..1c2747dc27 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -513,25 +513,23 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
     if (!(flags & ERR_REPLY_FLAG_NO_STATS_UPDATE)) {
         /* Increment the global error counter */
         server.stat_total_error_replies++;
+        /* Increment the error stats */
         /* After the errors RAX reaches this limit, instead of tracking
          * custom LUA errors, we track the error under `error_LUA` */
         if (flags & ERR_REPLY_FLAG_LUA && raxSize(server.errors) >= ERROR_STATS_LUA_LIMIT) {
             incrementErrorCount("LUA_ERRORSTATS_DISABLED", 23);
+        /* If the string already starts with "-..." then the error prefix
+        * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */ 
+        } else if (s[0] != '-') {
+            incrementErrorCount("ERR", 3);
         } else {
-            /* Increment the error stats
-            * If the string already starts with "-..." then the error prefix
-            * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
-            if (s[0] != '-') {
-                incrementErrorCount("ERR", 3);
+            char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
+            if (spaceloc) {
+                const size_t errEndPos = (size_t)(spaceloc - s);
+                incrementErrorCount(s+1, errEndPos-1);
             } else {
-                char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
-                if (spaceloc) {
-                    const size_t errEndPos = (size_t)(spaceloc - s);
-                    incrementErrorCount(s+1, errEndPos-1);
-                } else {
-                    /* Fallback to ERR if we can't retrieve the error prefix */
-                    incrementErrorCount("ERR", 3);
-                }
+                /* Fallback to ERR if we can't retrieve the error prefix */
+                incrementErrorCount("ERR", 3);
             }
         }
     } else {
diff --git a/src/server.h b/src/server.h
index 3222e33989..4ff209c0e9 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2569,7 +2569,8 @@ void serverSetCpuAffinity(const char *cpulist);
 /* afterErrorReply flags */
 #define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL<<0) /* Indicating that we should not update
                                                     error stats after sending error reply */
-#define ERR_REPLY_FLAG_LUA (1ULL<<1) /* Indicating that we are coming in from ScriptCall */
+#define ERR_REPLY_FLAG_LUA (1ULL<<1) /* Indicating that the error message is from LUA replying
+                                        to a client */
 
 /* networking.c -- Networking and Client related operations */
 client *createClient(connection *conn);

From bc0eaef1522765d9117d27e4c60b1a8bbc4dd9b9 Mon Sep 17 00:00:00 2001
From: Karthik Subbarao <karthikrs2021@gmail.com>
Date: Thu, 16 May 2024 16:58:13 +0000
Subject: [PATCH 3/6] Rename LUA Error Stat overflow error message

Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com>
---
 src/networking.c    | 2 +-
 src/server.h        | 3 ++-
 tests/unit/info.tcl | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/networking.c b/src/networking.c
index 1c2747dc27..784083a4f9 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -517,7 +517,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
         /* After the errors RAX reaches this limit, instead of tracking
          * custom LUA errors, we track the error under `error_LUA` */
         if (flags & ERR_REPLY_FLAG_LUA && raxSize(server.errors) >= ERROR_STATS_LUA_LIMIT) {
-            incrementErrorCount("LUA_ERRORSTATS_DISABLED", 23);
+            incrementErrorCount(LUA_ERRORSTATS_OVERFLOW_ERR, strlen(LUA_ERRORSTATS_OVERFLOW_ERR));
         /* If the string already starts with "-..." then the error prefix
         * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */ 
         } else if (s[0] != '-') {
diff --git a/src/server.h b/src/server.h
index 4ff209c0e9..b77ef69dfd 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2564,7 +2564,8 @@ void serverSetCpuAffinity(const char *cpulist);
 
 /* ERROR STATS constants */
 #define ERROR_STATS_LUA_LIMIT 128 /* After the errors RAX reaches this limit, instead of tracking 
-                                    custom LUA errors, we track the error under `error_LUA`. */
+                                    custom LUA errors, we track the error under `error_LUA_ERRORSTATS_OVERFLOW`. */
+#define LUA_ERRORSTATS_OVERFLOW_ERR "LUA_ERRORSTATS_OVERFLOW"
 
 /* afterErrorReply flags */
 #define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL<<0) /* Indicating that we should not update
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index dbe4d3606d..6a7fbe9438 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -281,9 +281,9 @@ start_server {tags {"info" "external:skip"}} {
                     r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
                 }
             }
-            # Validate that custom LUA errors are tracked in `LUA_ERRORSTATS_DISABLED` when errors
+            # Validate that custom LUA errors are tracked in `LUA_ERRORSTATS_OVERFLOW` when errors
             # has 128 entries.
-            assert_equal "count=972" [errorstat LUA_ERRORSTATS_DISABLED]
+            assert_equal "count=972" [errorstat LUA_ERRORSTATS_OVERFLOW]
             # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
             assert_error {ERR syntax error} {r set a b c d e f g}
             assert_equal "count=1" [errorstat ERR]

From 1c77f69b16f675cf15a5ce5d038d07a831862dfb Mon Sep 17 00:00:00 2001
From: Karthik Subbarao <karthikrs2021@gmail.com>
Date: Mon, 20 May 2024 15:36:57 +0000
Subject: [PATCH 4/6] Update documentation, minor refactor, additional test
 case

Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com>
---
 src/script_lua.c    | 2 +-
 src/server.h        | 2 +-
 tests/unit/info.tcl | 5 ++++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/script_lua.c b/src/script_lua.c
index 501116ae4f..eff914779d 100644
--- a/src/script_lua.c
+++ b/src/script_lua.c
@@ -639,7 +639,7 @@ static void luaReplyToRedisReply(client *c, client* script_client, lua_State *lu
             errorInfo err_info = {0};
             luaExtractErrorInformation(lua, &err_info);
             addReplyErrorFormatEx(c,
-                                  err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE | ERR_REPLY_FLAG_LUA: ERR_REPLY_FLAG_LUA,
+                                  ERR_REPLY_FLAG_LUA | (err_info.ignore_err_stats_update? ERR_REPLY_FLAG_NO_STATS_UPDATE: 0),
                                   "-%s",
                                   err_info.msg);
             luaErrorInformationDiscard(&err_info);
diff --git a/src/server.h b/src/server.h
index b77ef69dfd..b9c9a00b4d 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2564,7 +2564,7 @@ void serverSetCpuAffinity(const char *cpulist);
 
 /* ERROR STATS constants */
 #define ERROR_STATS_LUA_LIMIT 128 /* After the errors RAX reaches this limit, instead of tracking 
-                                    custom LUA errors, we track the error under `error_LUA_ERRORSTATS_OVERFLOW`. */
+                                    custom LUA errors, we track the error under the symbol below. */
 #define LUA_ERRORSTATS_OVERFLOW_ERR "LUA_ERRORSTATS_OVERFLOW"
 
 /* afterErrorReply flags */
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 6a7fbe9438..f72166ce32 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -278,7 +278,7 @@ start_server {tags {"info" "external:skip"}} {
             r config resetstat
             for {set j 1} {$j <= 1100} {incr j} {
                 assert_error "$j my error message" {
-                    r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
+                    r eval {return server.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
                 }
             }
             # Validate that custom LUA errors are tracked in `LUA_ERRORSTATS_OVERFLOW` when errors
@@ -287,6 +287,9 @@ start_server {tags {"info" "external:skip"}} {
             # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
             assert_error {ERR syntax error} {r set a b c d e f g}
             assert_equal "count=1" [errorstat ERR]
+            r eval {return server.error_reply(string.format('My error message'))} 0
+            r eval {return {err = 'My error message'}} 0
+            assert_equal "count=974" [errorstat LUA_ERRORSTATS_OVERFLOW]
         }
 
         test {stats: eventloop metrics} {

From 8206c4ae59d8763f4ed0af45cf7dc0dafc0857ec Mon Sep 17 00:00:00 2001
From: KarthikSubbarao <karthikrs2021@gmail.com>
Date: Tue, 21 May 2024 04:06:37 +0000
Subject: [PATCH 5/6] update tests

Signed-off-by: KarthikSubbarao <karthikrs2021@gmail.com>
---
 tests/unit/info.tcl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index f72166ce32..ab5dc06df3 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -287,8 +287,8 @@ start_server {tags {"info" "external:skip"}} {
             # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
             assert_error {ERR syntax error} {r set a b c d e f g}
             assert_equal "count=1" [errorstat ERR]
-            r eval {return server.error_reply(string.format('My error message'))} 0
-            r eval {return {err = 'My error message'}} 0
+            assert_error "My error message" {r eval {return server.error_reply(string.format('My error message'))} 0}
+            assert_error "My error message" {r eval {return {err = 'My error message'}} 0}
             assert_equal "count=974" [errorstat LUA_ERRORSTATS_OVERFLOW]
         }
 

From a03333b61351696a58b77671174fb6194e21a738 Mon Sep 17 00:00:00 2001
From: KarthikSubbarao <karthikrs2021@gmail.com>
Date: Tue, 21 May 2024 04:41:04 +0000
Subject: [PATCH 6/6] Add tests for Valkey Functions

Signed-off-by: KarthikSubbarao <karthikrs2021@gmail.com>
---
 tests/unit/info.tcl | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index ab5dc06df3..181eb1412f 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -287,9 +287,19 @@ start_server {tags {"info" "external:skip"}} {
             # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
             assert_error {ERR syntax error} {r set a b c d e f g}
             assert_equal "count=1" [errorstat ERR]
-            assert_error "My error message" {r eval {return server.error_reply(string.format('My error message'))} 0}
+            # Test LUA error variants.
+            assert_error "My error message" {r eval {return server.error_reply('My error message')} 0}
             assert_error "My error message" {r eval {return {err = 'My error message'}} 0}
             assert_equal "count=974" [errorstat LUA_ERRORSTATS_OVERFLOW]
+            # Function calls that contain custom error messages should call be included in overflow counter
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('customerrorfn', function() return server.error_reply('My error message') end)"]
+            assert_error "My error message" {r fcall customerrorfn 0}
+            assert_equal "count=975" [errorstat LUA_ERRORSTATS_OVERFLOW]
+            # Function calls that contain non lua errors should continue to be tracked normally (in a separate counter).
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('invalidgetcmd', function() return server.call('get', 'x', 'x', 'x') end)"]
+            assert_error "ERR Wrong number of args*" {r fcall invalidgetcmd 0}
+            assert_equal "count=975" [errorstat LUA_ERRORSTATS_OVERFLOW]
+            assert_equal "count=2" [errorstat ERR]
         }
 
         test {stats: eventloop metrics} {