Set repl-diskless-sync to yes by default, add repl-diskless-sync-max-replicas (#10092)

1. enable diskless replication by default 2. add a new config named repl-diskless-sync-max-replicas that enables replication to start before the full repl-diskless-sync-delay was reached. 3. put replica online sooner on the master (see below) 4. test suite uses repl-diskless-sync-delay of 0 to be faster 5. a few tests that use multiple replica on a pre-populated master, are now using the new repl-diskless-sync-max-replicas 6. fix possible timing issues in a few cluster tests (see below) put replica online sooner on the master ---------------------------------------------------- there were two tests that failed because they needed for the master to realize that the replica is online, but the test code was actually only waiting for the replica to realize it's online, and in diskless it could have been before the master realized it. changes include two things: 1. the tests wait on the right thing 2. issues in the master, putting the replica online in two steps. the master used to put the replica as online in 2 steps. the first step was to mark it as online, and the second step was to enable the write event (only after getting ACK), but in fact the first step didn't contains some of the tasks to put it online (like updating good slave count, and sending the module event). this meant that if a test was waiting to see that the replica is online form the point of view of the master, and then confirm that the module got an event, or that the master has enough good replicas, it could fail due to timing issues. so now the full effect of putting the replica online, happens at once, and only the part about enabling the writes is delayed till the ACK. fix cluster tests -------------------- I added some code to wait for the replica to sync and avoid race conditions. later realized the sentinel and cluster tests where using the original 5 seconds delay, so changed it to 0. this means the other changes are probably not needed, but i suppose they're still better (avoid race conditions)
author: Oran Agra <oran@redislabs.com> 2022-01-17 14:11:11 +0200
committer: GitHub <noreply@github.com> 2022-01-17 14:11:11 +0200
commit: ae89958972ee720be2bff68231d6353553c2272a (patch)
tree: d0cc61d5bc718b7c5077e5746abb802184423851 /src
parent: 90916f16a5536bc833529d2e563d1094433de495 (diff)
download: redis-ae89958972ee720be2bff68231d6353553c2272a.tar.gz
4 files changed, 53 insertions, 39 deletions
diff --git a/src/config.c b/src/config.c
index f206bcc8c..d11445775 100644
--- a/src/config.c
+++ b/src/config.c
@@ -2752,7 +2752,7 @@ standardConfig configs[] = {
     createBoolConfig("lazyfree-lazy-user-del", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.lazyfree_lazy_user_del , 0, NULL, NULL),
     createBoolConfig("lazyfree-lazy-user-flush", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.lazyfree_lazy_user_flush , 0, NULL, NULL),
     createBoolConfig("repl-disable-tcp-nodelay", NULL, MODIFIABLE_CONFIG, server.repl_disable_tcp_nodelay, 0, NULL, NULL),
-    createBoolConfig("repl-diskless-sync", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_diskless_sync, 0, NULL, NULL),
+    createBoolConfig("repl-diskless-sync", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_diskless_sync, 1, NULL, NULL),
     createBoolConfig("aof-rewrite-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.aof_rewrite_incremental_fsync, 1, NULL, NULL),
     createBoolConfig("no-appendfsync-on-rewrite", NULL, MODIFIABLE_CONFIG, server.aof_no_fsync_on_rewrite, 0, NULL, NULL),
     createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL),
@@ -2859,6 +2859,7 @@ standardConfig configs[] = {
     createIntConfig("min-replicas-max-lag", "min-slaves-max-lag", MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_min_slaves_max_lag, 10, INTEGER_CONFIG, NULL, updateGoodSlaves),
     createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod),
     createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL),
+    createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL),
 
     /* Unsigned int configs */
     createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients),
diff --git a/src/networking.c b/src/networking.c
index 22d9c6812..57576032e 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -162,7 +162,7 @@ client *createClient(connection *conn) {
     c->ctime = c->lastinteraction = server.unixtime;
     clientSetDefaultAuth(c);
     c->replstate = REPL_STATE_NONE;
-    c->repl_put_online_on_ack = 0;
+    c->repl_start_cmd_stream_on_ack = 0;
     c->reploff = 0;
     c->read_reploff = 0;
     c->repl_ack_off = 0;
@@ -225,7 +225,7 @@ void clientInstallWriteHandler(client *c) {
      * writes at this stage. */
     if (!(c->flags & CLIENT_PENDING_WRITE) &&
         (c->replstate == REPL_STATE_NONE ||
-         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
+         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_start_cmd_stream_on_ack)))
     {
         /* Here instead of installing the write handler, we just flag the
          * client and put it into a list of clients that have something
@@ -3588,7 +3588,7 @@ void flushSlavesOutputBuffers(void) {
          */
         if (slave->replstate == SLAVE_STATE_ONLINE &&
             can_receive_writes &&
-            !slave->repl_put_online_on_ack &&
+            !slave->repl_start_cmd_stream_on_ack &&
             clientHasPendingReplies(slave))
         {
             writeToClient(slave,0);
diff --git a/src/replication.c b/src/replication.c
index 75472bf9e..e387a8fd4 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -44,7 +44,8 @@
 void replicationDiscardCachedMaster(void);
 void replicationResurrectCachedMaster(connection *conn);
 void replicationSendAck(void);
-void putSlaveOnline(client *slave);
+void replicaPutOnline(client *slave);
+void replicaStartCommandStream(client *slave);
 int cancelReplicationHandshake(int reconnect);
 
 /* We take a global flag to remember if this instance generated an RDB
@@ -768,7 +769,7 @@ int masterTryPartialResynchronization(client *c) {
     c->flags |= CLIENT_SLAVE;
     c->replstate = SLAVE_STATE_ONLINE;
     c->repl_ack_time = server.unixtime;
-    c->repl_put_online_on_ack = 0;
+    c->repl_start_cmd_stream_on_ack = 0;
     listAddNodeTail(server.slaves,c);
     /* We can't use the connection buffers since they are used to accumulate
      * new commands at this stage. But we are sure the socket send buffer is
@@ -1183,8 +1184,8 @@ void replconfCommand(client *c) {
              * quick check first (instead of waiting for the next ACK. */
             if (server.child_type == CHILD_TYPE_RDB && c->replstate == SLAVE_STATE_WAIT_BGSAVE_END)
                 checkChildrenDone();
-            if (c->repl_put_online_on_ack && c->replstate == SLAVE_STATE_ONLINE)
-                putSlaveOnline(c);
+            if (c->repl_start_cmd_stream_on_ack && c->replstate == SLAVE_STATE_ONLINE)
+                replicaStartCommandStream(c);
             /* Note: this command does not reply anything! */
             return;
         } else if (!strcasecmp(c->argv[j]->ptr,"getack")) {
@@ -1238,37 +1239,20 @@ void replconfCommand(client *c) {
 }
 
 /* This function puts a replica in the online state, and should be called just
- * after a replica received the RDB file for the initial synchronization, and
- * we are finally ready to send the incremental stream of commands.
+ * after a replica received the RDB file for the initial synchronization.
  *
  * It does a few things:
- * 1) Close the replica's connection async if it doesn't need replication
- *    commands buffer stream, since it actually isn't a valid replica.
- * 2) Put the slave in ONLINE state. Note that the function may also be called
- *    for a replicas that are already in ONLINE state, but having the flag
- *    repl_put_online_on_ack set to true: we still have to install the write
- *    handler in that case. This function will take care of that.
- * 3) Make sure the writable event is re-installed, since calling the SYNC
- *    command disables it, so that we can accumulate output buffer without
- *    sending it to the replica.
- * 4) Update the count of "good replicas". */
-void putSlaveOnline(client *slave) {
-    slave->replstate = SLAVE_STATE_ONLINE;
-    slave->repl_put_online_on_ack = 0;
-    slave->repl_ack_time = server.unixtime; /* Prevent false timeout. */
-
+ * 1) Put the slave in ONLINE state.
+ * 2) Update the count of "good replicas".
+ * 3) Trigger the module event. */
+void replicaPutOnline(client *slave) {
     if (slave->flags & CLIENT_REPL_RDBONLY) {
-        serverLog(LL_NOTICE,
-            "Close the connection with replica %s as RDB transfer is complete",
-            replicationGetSlaveName(slave));
-        freeClientAsync(slave);
-        return;
-    }
-    if (connSetWriteHandler(slave->conn, sendReplyToClient) == C_ERR) {
-        serverLog(LL_WARNING,"Unable to register writable event for replica bulk transfer: %s", strerror(errno));
-        freeClient(slave);
         return;
     }
+
+    slave->replstate = SLAVE_STATE_ONLINE;
+    slave->repl_ack_time = server.unixtime; /* Prevent false timeout. */
+
     refreshGoodSlavesCount();
     /* Fire the replica change modules event. */
     moduleFireServerEvent(REDISMODULE_EVENT_REPLICA_CHANGE,
@@ -1278,6 +1262,30 @@ void putSlaveOnline(client *slave) {
         replicationGetSlaveName(slave));
 }
 
+/* This function should be called just after a replica received the RDB file
+ * for the initial synchronization, and we are finally ready to send the
+ * incremental stream of commands.
+ *
+ * It does a few things:
+ * 1) Close the replica's connection async if it doesn't need replication
+ *    commands buffer stream, since it actually isn't a valid replica.
+ * 2) Make sure the writable event is re-installed, since when calling the SYNC
+ *    command we had no replies and it was disabled, and then we could
+ *    accumulate output buffer data without sending it to the replica so it
+ *    won't get mixed with the RDB stream. */
+void replicaStartCommandStream(client *slave) {
+    slave->repl_start_cmd_stream_on_ack = 0;
+    if (slave->flags & CLIENT_REPL_RDBONLY) {
+        serverLog(LL_NOTICE,
+            "Close the connection with replica %s as RDB transfer is complete",
+            replicationGetSlaveName(slave));
+        freeClientAsync(slave);
+        return;
+    }
+
+    clientInstallWriteHandler(slave);
+}
+
 /* We call this function periodically to remove an RDB file that was
  * generated because of replication, in an instance that is otherwise
  * without any persistence. We don't want instances without persistence
@@ -1376,7 +1384,8 @@ void sendBulkToSlave(connection *conn) {
         close(slave->repldbfd);
         slave->repldbfd = -1;
         connSetWriteHandler(slave->conn,NULL);
-        putSlaveOnline(slave);
+        replicaPutOnline(slave);
+        replicaStartCommandStream(slave);
     }
 }
 
@@ -1583,9 +1592,8 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
                  * after such final EOF. So we don't want to glue the end of
                  * the RDB transfer with the start of the other replication
                  * data. */
-                slave->replstate = SLAVE_STATE_ONLINE;
-                slave->repl_put_online_on_ack = 1;
-                slave->repl_ack_time = server.unixtime; /* Timeout otherwise. */
+                replicaPutOnline(slave);
+                slave->repl_start_cmd_stream_on_ack = 1;
             } else {
                 if ((slave->repldbfd = open(server.rdb_filename,O_RDONLY)) == -1 ||
                     redis_fstat(slave->repldbfd,&buf) == -1) {
@@ -3721,6 +3729,8 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) {
 
         if (slaves_waiting &&
             (!server.repl_diskless_sync ||
+             (server.repl_diskless_sync_max_replicas > 0 &&
+              slaves_waiting >= server.repl_diskless_sync_max_replicas) ||
              max_idle >= server.repl_diskless_sync_delay))
         {
             if (mincapa_out)
diff --git a/src/server.h b/src/server.h
index b98883b33..2570302c3 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1084,7 +1084,7 @@ typedef struct client {
     uint64_t flags;         /* Client flags: CLIENT_* macros. */
     int authenticated;      /* Needed when the default user requires auth. */
     int replstate;          /* Replication state if this is a slave. */
-    int repl_put_online_on_ack; /* Install slave write handler on first ACK. */
+    int repl_start_cmd_stream_on_ack; /* Install slave write handler on first ACK. */
     int repldbfd;           /* Replication DB file descriptor. */
     off_t repldboff;        /* Replication DB file offset. */
     off_t repldbsize;       /* Replication DB file size. */
@@ -1706,6 +1706,8 @@ struct redisServer {
     int repl_diskless_load;         /* Slave parse RDB directly from the socket.
                                      * see REPL_DISKLESS_LOAD_* enum */
     int repl_diskless_sync_delay;   /* Delay to start a diskless repl BGSAVE. */
+    int repl_diskless_sync_max_replicas;/* Max replicas for diskless repl BGSAVE
+                                         * delay (start sooner if they all connect). */
     size_t repl_buffer_mem;         /* The memory of replication buffer. */
     list *repl_buffer_blocks;       /* Replication buffers blocks list
                                      * (serving replica clients and repl backlog) */
@@ -2458,6 +2460,7 @@ void unprotectClient(client *c);
 void initThreadedIO(void);
 client *lookupClientByID(uint64_t id);
 int authRequired(client *c);
+void clientInstallWriteHandler(client *c);
 
 #ifdef __GNUC__
 void addReplyErrorFormat(client *c, const char *fmt, ...)
author	Oran Agra <oran@redislabs.com>	2022-01-17 14:11:11 +0200
committer	GitHub <noreply@github.com>	2022-01-17 14:11:11 +0200
commit	ae89958972ee720be2bff68231d6353553c2272a (patch)
tree	d0cc61d5bc718b7c5077e5746abb802184423851 /src
parent	90916f16a5536bc833529d2e563d1094433de495 (diff)
download	redis-ae89958972ee720be2bff68231d6353553c2272a.tar.gz