From 66002530466a45bce85e4930364f1b153c44840b Mon Sep 17 00:00:00 2001 From: yoav-steinberg Date: Sun, 26 Sep 2021 17:45:02 +0300 Subject: Client eviction ci issues (#9549) Fixing CI test issues introduced in #8687 - valgrind warnings in readQueryFromClient when client was freed by processInputBuffer - adding DEBUG pause-cron for tests not to be time dependent. - skipping a test that depends on socket buffers / events not compatible with TLS - making sure client got subscribed by not using deferring client --- src/blocked.c | 4 +++- src/debug.c | 6 ++++++ src/networking.c | 27 +++++++++++++++++++-------- src/server.c | 4 ++++ src/server.h | 3 ++- tests/README.md | 1 + tests/support/server.tcl | 5 +++++ tests/unit/client-eviction.tcl | 22 ++++++++-------------- tests/unit/maxmemory.tcl | 9 ++++++--- 9 files changed, 54 insertions(+), 27 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 86aed2440..67fd3fdca 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -141,8 +141,10 @@ void processUnblockedClients(void) { if (processPendingCommandsAndResetClient(c) == C_OK) { /* Now process client if it has more data in it's buffer. */ if (c->querybuf && sdslen(c->querybuf) > 0) { - processInputBuffer(c); + if (processInputBuffer(c) == C_ERR) c = NULL; } + } else { + c = NULL; } } beforeNextClient(c); diff --git a/src/debug.c b/src/debug.c index 4e95e2dfd..ccaf93758 100644 --- a/src/debug.c +++ b/src/debug.c @@ -471,6 +471,8 @@ void debugCommand(client *c) { " Show low level info about the ziplist encoding of .", "CLIENT-EVICTION", " Show low level client eviction pools info (maxmemory-clients).", +"PAUSE-CRON <0|1>", +" Stop periodic cron job processing.", NULL }; addReplyHelp(c, help); @@ -910,6 +912,10 @@ NULL mallctl_string(c, c->argv+2, c->argc-2); return; #endif + } else if (!strcasecmp(c->argv[1]->ptr,"pause-cron") && c->argc == 3) + { + server.pause_cron = atoi(c->argv[2]->ptr); + addReply(c,shared.ok); } else { addReplySubcommandSyntaxError(c); return; diff --git a/src/networking.c b/src/networking.c index 21278d783..1d8e264cb 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1481,7 +1481,9 @@ void freeClientAsync(client *c) { /* Perform processing of the client before moving on to processing the next client * this is useful for performing operations that affect the global state but can't * wait until we're done with all clients. In other words can't wait until beforeSleep() - * return C_ERR in case client is no longer valid after call. */ + * return C_ERR in case client is no longer valid after call. + * The input client argument: c, may be NULL in case the previous client was + * freed before the call. */ int beforeNextClient(client *c) { /* Skip the client processing if we're in an IO thread, in that case we'll perform this operation later (this function is called again) in the fan-in stage of the threading mechanism */ @@ -1492,7 +1494,7 @@ int beforeNextClient(client *c) { * cases where we want an async free of a client other than myself. For example * in ACL modifications we disconnect clients authenticated to non-existent * users (see ACL LOAD). */ - if (c->flags & CLIENT_CLOSE_ASAP) { + if (c && (c->flags & CLIENT_CLOSE_ASAP)) { freeClient(c); return C_ERR; } @@ -2106,8 +2108,9 @@ int processPendingCommandsAndResetClient(client *c) { /* This function is called every time, in the client structure 'c', there is * more query buffer to process, because we read more data from the socket * or because a client was blocked and later reactivated, so there could be - * pending query buffer, already representing a full command, to process. */ -void processInputBuffer(client *c) { + * pending query buffer, already representing a full command, to process. + * return C_ERR in case the client was freed during the processing */ +int processInputBuffer(client *c) { /* Keep processing while there is something in the input buffer */ while(c->qb_pos < sdslen(c->querybuf)) { /* Immediately abort if the client is in the middle of something. */ @@ -2165,7 +2168,7 @@ void processInputBuffer(client *c) { /* If the client is no longer valid, we avoid exiting this * loop and trimming the client buffer later. So we return * ASAP in that case. */ - return; + return C_ERR; } } } @@ -2180,6 +2183,8 @@ void processInputBuffer(client *c) { * important in case the query buffer is big and wasn't drained during * the above loop (because of partially sent big commands). */ updateClientMemUsage(c); + + return C_OK; } void readQueryFromClient(connection *conn) { @@ -2270,8 +2275,9 @@ void readQueryFromClient(connection *conn) { } /* There is more data in the client input buffer, continue parsing it - * in case to check if there is a full command to execute. */ - processInputBuffer(c); + * and check if there is a full command to execute. */ + if (processInputBuffer(c) == C_ERR) + c = NULL; done: beforeNextClient(c); @@ -3855,7 +3861,12 @@ int handleClientsWithPendingReadsUsingThreads(void) { continue; } - processInputBuffer(c); + if (processInputBuffer(c) == C_ERR) { + /* If the client is no longer valid, we avoid + * processing the client later. So we just go + * to the next. */ + continue; + } /* We may have pending replies if a thread readQueryFromClient() produced * replies and did not install a write handler (it can't). diff --git a/src/server.c b/src/server.c index 9b8fd48ef..709aa8703 100644 --- a/src/server.c +++ b/src/server.c @@ -2564,6 +2564,9 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { } } + /* for debug purposes: skip actual cron work if pause_cron is on */ + if (server.pause_cron) return 1000/server.hz; + run_with_period(100) { long long stat_net_input_bytes, stat_net_output_bytes; atomicGet(server.stat_net_input_bytes, stat_net_input_bytes); @@ -3187,6 +3190,7 @@ void initServerConfig(void) { server.next_client_id = 1; /* Client IDs, start from 1 .*/ server.loading_process_events_interval_bytes = (1024*1024*2); server.page_size = sysconf(_SC_PAGESIZE); + server.pause_cron = 0; unsigned int lruclock = getLRUClock(); atomicSet(server.lruclock,lruclock); diff --git a/src/server.h b/src/server.h index 3a94effe9..065bec445 100644 --- a/src/server.h +++ b/src/server.h @@ -1417,6 +1417,7 @@ struct redisServer { int set_proc_title; /* True if change proc title */ char *proc_title_template; /* Process title template format */ clientBufferLimitsConfig client_obuf_limits[CLIENT_TYPE_OBUF_COUNT]; + int pause_cron; /* Don't run cron tasks (debug) */ /* AOF persistence */ int aof_enabled; /* AOF configuration */ int aof_state; /* AOF_(ON|OFF|WAIT_REWRITE) */ @@ -2005,7 +2006,7 @@ void setDeferredMapLen(client *c, void *node, long length); void setDeferredSetLen(client *c, void *node, long length); void setDeferredAttributeLen(client *c, void *node, long length); void setDeferredPushLen(client *c, void *node, long length); -void processInputBuffer(client *c); +int processInputBuffer(client *c); void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTLSHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask); diff --git a/tests/README.md b/tests/README.md index 980f8c3ad..178f9dd55 100644 --- a/tests/README.md +++ b/tests/README.md @@ -36,6 +36,7 @@ The following compatibility and capability tags are currently used: | --------------------- | --------- | | `external:skip` | Not compatible with external servers. | | `cluster:skip` | Not compatible with `--cluster-mode`. | +| `tls:skip` | Not campatible with `--tls`. | | `needs:repl` | Uses replication and needs to be able to `SYNC` from server. | | `needs:debug` | Uses the `DEBUG` command or other debugging focused commands (like `OBJECT`). | | `needs:pfdebug` | Uses the `PFDEBUG` command. | diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 2e2b0d75a..0563995fd 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -202,6 +202,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::tls && [lsearch $tags "tls:skip"] >= 0} { + set err "Not supported in tls mode" + return 0 + } + return 1 } diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl index 9421d8b61..0b1b8b281 100644 --- a/tests/unit/client-eviction.tcl +++ b/tests/unit/client-eviction.tcl @@ -184,11 +184,6 @@ start_server {} { test "client evicted due to tracking redirection" { r flushdb - # Use slow hz to avoid clientsCron from updating memory usage frequently since - # we're testing the update logic when writing tracking redirection output - set backup_hz [lindex [r config get hz] 1] - r config set hz 1 - set rr [redis_client] set redirected_c [redis_client] $redirected_c client setname redirected_client @@ -196,12 +191,16 @@ start_server {} { $redirected_c SUBSCRIBE __redis__:invalidate $rr client tracking on redirect $redir_id bcast # Use a big key name to fill the redirected tracking client's buffer quickly - set key_length [expr 1024*10] + set key_length [expr 1024*200] set long_key [string repeat k $key_length] # Use a script so we won't need to pass the long key name when dirtying it in the loop set script_sha [$rr script load "redis.call('incr', '$long_key')"] + + # Pause serverCron so it won't update memory usage since we're testing the update logic when + # writing tracking redirection output + r debug pause-cron 1 + # Read and write to same (long) key until redirected_client's buffers cause it to be evicted - set t [clock milliseconds] catch { while true { set mem [client_field redirected_client tot-mem] @@ -210,13 +209,8 @@ start_server {} { } } e assert_match {no client named redirected_client found*} $e - - # Make sure eviction happened in less than 1sec, this means, statistically eviction - # wasn't caused by clientCron accounting - set t [expr [clock milliseconds] - $t] - assert {$t < 1000} - - r config set hz $backup_hz + + r debug pause-cron 0 $rr close $redirected_c close } diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 0bc0e4b6d..5caa23810 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -74,7 +74,11 @@ start_server {tags {"maxmemory" "external:skip"}} { } verify_test $client_eviction - } + + # This test relies on SIGSTOP/CONT to handle all sent commands in a single event loop. + # In TLS multiple event loops are needed to receive all sent commands, so the test breaks. + # Mark it to be skipped when running in TLS mode. + } {} {tls:skip} foreach rr $clients { $rr close } @@ -113,13 +117,12 @@ start_server {tags {"maxmemory" "external:skip"}} { init_test $client_eviction for {set j 0} {$j < 10} {incr j} { - set rr [redis_deferring_client] + set rr [redis_client] lappend clients $rr } foreach rr $clients { $rr subscribe bla - $rr flush } for {set j 0} {$j < 40} {incr j} { -- cgit v1.2.1