PSYNC2: meaningful offset implemented.

A very commonly signaled operational problem with Redis master-replicas sets is that, once the master becomes unavailable for some reason, especially because of network problems, many times it wont be able to perform a partial resynchronization with the new master, once it rejoins the partition, for the following reason: 1. The master becomes isolated, however it keeps sending PINGs to the replicas. Such PINGs will never be received since the link connection is actually already severed. 2. On the other side, one of the replicas will turn into the new master, setting its secondary replication ID offset to the one of the last command received from the old master: this offset will not include the PINGs sent by the master once the link was already disconnected. 3. When the master rejoins the partion and is turned into a replica, its offset will be too advanced because of the PINGs, so a PSYNC will fail, and a full synchronization will be required. Related to issue #7002 and other discussion we had in the past around this problem.
author: antirez <antirez@gmail.com> 2020-03-24 11:02:40 +0100
committer: antirez <antirez@gmail.com> 2020-03-25 15:26:37 +0100
commit: 57fa355e56540a6fab61a5b8ecdda96225815632 (patch)
tree: 223ee35175d615f9dff7a7fef588d47ad96f5502
parent: f15042dbf0f0981c827126cf70bf335e755253f0 (diff)
download: redis-57fa355e56540a6fab61a5b8ecdda96225815632.tar.gz
3 files changed, 40 insertions, 1 deletions
diff --git a/src/replication.c b/src/replication.c
index 49c38f73f..e62afb2fb 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -162,6 +162,7 @@ void feedReplicationBacklog(void *ptr, size_t len) {
     unsigned char *p = ptr;
 
     server.master_repl_offset += len;
+    server.master_repl_meaningful_offset = server.master_repl_offset;
 
     /* This is a circular buffer, so write as much data we can at every
      * iteration and rewind the "idx" index if we reach the limit. */
@@ -1768,6 +1769,7 @@ void readSyncBulkPayload(connection *conn) {
      * we are starting a new history. */
     memcpy(server.replid,server.master->replid,sizeof(server.replid));
     server.master_repl_offset = server.master->reploff;
+    server.master_repl_meaningful_offset = server.master->reploff;
     clearReplicationId2();
 
     /* Let's create the replication backlog if needed. Slaves need to
@@ -2725,12 +2727,37 @@ void replicationCacheMaster(client *c) {
  * current offset if no data was lost during the failover. So we use our
  * current replication ID and offset in order to synthesize a cached master. */
 void replicationCacheMasterUsingMyself(void) {
+    serverLog(LL_NOTICE,
+        "Before turning into a replica, using my own master parameters "
+        "to synthesize a cached master: I may be able to synchronize with "
+        "the new master with just a partial transfer.");
+
     /* This will be used to populate the field server.master->reploff
      * by replicationCreateMasterClient(). We'll later set the created
      * master as server.cached_master, so the replica will use such
      * offset for PSYNC. */
     server.master_initial_offset = server.master_repl_offset;
 
+    /* However if the "meaningful" offset, that is the offset without
+     * the final PINGs in the stream, is different, use this instead:
+     * often when the master is no longer reachable, replicas will never
+     * receive the PINGs, however the master will end with an incremented
+     * offset because of the PINGs and will not be able to incrementally
+     * PSYNC with the new master. */
+    if (server.master_repl_offset > server.master_repl_meaningful_offset) {
+        long long delta = server.master_repl_offset -
+                          server.master_repl_meaningful_offset;
+        serverLog(LL_NOTICE,
+            "Using the meaningful offset %lld instead of %lld to exclude "
+            "the final PINGs (%lld bytes difference)",
+                server.master_repl_meaningful_offset,
+                server.master_repl_offset,
+                delta);
+        server.master_initial_offset = server.master_repl_meaningful_offset;
+        server.repl_backlog_histlen -= delta;
+        if (server.repl_backlog_histlen < 0) server.repl_backlog_histlen = 0;
+    }
+
     /* The master client we create can be set to any DBID, because
      * the new master will start its replication stream with SELECT. */
     replicationCreateMasterClient(NULL,-1);
@@ -2742,7 +2769,6 @@ void replicationCacheMasterUsingMyself(void) {
     unlinkClient(server.master);
     server.cached_master = server.master;
     server.master = NULL;
-    serverLog(LL_NOTICE,"Before turning into a replica, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.");
 }
 
 /* Free a cached master, called when there are no longer the conditions for
@@ -3122,10 +3148,18 @@ void replicationCron(void) {
             clientsArePaused();
 
         if (!manual_failover_in_progress) {
+            long long before_ping = server.master_repl_meaningful_offset;
             ping_argv[0] = createStringObject("PING",4);
             replicationFeedSlaves(server.slaves, server.slaveseldb,
                 ping_argv, 1);
             decrRefCount(ping_argv[0]);
+            /* The server.master_repl_meaningful_offset variable represents
+             * the offset of the replication stream without the pending PINGs.
+             * This is useful to set the right replication offset for PSYNC
+             * when the master is turned into a replica. Otherwise pending
+             * PINGs may not allow it to perform an incremental sync with the
+             * new master. */
+            server.master_repl_meaningful_offset = before_ping;
         }
     }
 
diff --git a/src/server.c b/src/server.c
index 65a01db57..84439461e 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2355,6 +2355,7 @@ void initServerConfig(void) {
     server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT;
     server.repl_down_since = 0; /* Never connected, repl is down since EVER. */
     server.master_repl_offset = 0;
+    server.master_repl_meaningful_offset = 0;
 
     /* Replication partial resync backlog */
     server.repl_backlog = NULL;
@@ -4398,6 +4399,7 @@ sds genRedisInfoString(const char *section) {
             "master_replid:%s\r\n"
             "master_replid2:%s\r\n"
             "master_repl_offset:%lld\r\n"
+            "master_repl_meaningful_offset:%lld\r\n"
             "second_repl_offset:%lld\r\n"
             "repl_backlog_active:%d\r\n"
             "repl_backlog_size:%lld\r\n"
@@ -4406,6 +4408,7 @@ sds genRedisInfoString(const char *section) {
             server.replid,
             server.replid2,
             server.master_repl_offset,
+            server.master_repl_meaningful_offset,
             server.second_replid_offset,
             server.repl_backlog != NULL,
             server.repl_backlog_size,
@@ -4783,6 +4786,7 @@ void loadDataFromDisk(void) {
             {
                 memcpy(server.replid,rsi.repl_id,sizeof(server.replid));
                 server.master_repl_offset = rsi.repl_offset;
+                server.master_repl_meaningful_offset = rsi.repl_offset;
                 /* If we are a slave, create a cached master from this
                  * information, in order to allow partial resynchronizations
                  * with masters. */
diff --git a/src/server.h b/src/server.h
index ed4707d66..818fbc3b3 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1241,6 +1241,7 @@ struct redisServer {
     char replid[CONFIG_RUN_ID_SIZE+1];  /* My current replication ID. */
     char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/
     long long master_repl_offset;   /* My current replication offset */
+    long long master_repl_meaningful_offset; /* Offset minus latest PINGs. */
     long long second_replid_offset; /* Accept offsets up to this for replid2. */
     int slaveseldb;                 /* Last SELECTed DB in replication output */
     int repl_ping_slave_period;     /* Master pings the slave every N seconds */
author	antirez <antirez@gmail.com>	2020-03-24 11:02:40 +0100
committer	antirez <antirez@gmail.com>	2020-03-25 15:26:37 +0100
commit	57fa355e56540a6fab61a5b8ecdda96225815632 (patch)
tree	223ee35175d615f9dff7a7fef588d47ad96f5502
parent	f15042dbf0f0981c827126cf70bf335e755253f0 (diff)
download	redis-57fa355e56540a6fab61a5b8ecdda96225815632.tar.gz