diskless replication on slave side (don't store rdb to file), plus some other related fixes

The implementation of diskless replication was so far diskless only on the master side. The slave side still stores the received rdb file to the disk before loading it back in and parsing it. other changes: -------------- don't save rdb / aof file when we're a slave that is not synced (sync failed and dataset is empty), so that we don't overwrite an existing one and end up losing data on failover. loadAppendOnlyFile (loadDataFromDisk) would have called exit() if the file doesn't exist, but that would never happen since the file was always already created in initServer before that check. instead: don't create an empty aof file on startup before reading it and only create it when we start writing to it. this allows us to distinguish between success to load an empty file and a failure to load a non-existing file. currently we don't use the above since the startup should succeed even if the file doesn't exist (first server startup). maybe we need to add another config called "preload-file" or "load-on-startup" or "abort-when-load-fails". distinguish between aof configuration and state so that we can re-enable aof only when sync eventually succeeds (and not when exiting from readSyncBulkPayload after a failed attempt); also a CONFIG GET and INFO during rdb loading would have lied. SLAVEOF NO ONE will have an argument to succeed only if the slave is in sync (a specific offset can be provided). tests: add test for not saving on exit for unsynced slave; replication tests for diskless slave and diskless master; other replication tests improvements (not related to diskless slave)
author: Oran Agra <oran@redislabs.com> 2015-03-02 11:20:19 +0200
committer: antirez <antirez@gmail.com> 2017-12-11 11:09:33 +0100
commit: 9e5224bfd5ec13c14f89fe5eb15441a1d74d8dc5 (patch)
tree: 3a600448a3a433b5fe1e1d360efa2dcd3ca97a72 /tests
parent: 0f26125841b13c29d89a2b78957262f164b31770 (diff)
download: redis-9e5224bfd5ec13c14f89fe5eb15441a1d74d8dc5.tar.gz
5 files changed, 234 insertions, 102 deletions
diff --git a/tests/integration/psync2.tcl b/tests/integration/psync2.tcl
index 3d9e5527a..9ad46f389 100644
--- a/tests/integration/psync2.tcl
+++ b/tests/integration/psync2.tcl
@@ -1,3 +1,4 @@
+proc test_psync2 {mdl sdls sdll} {
 start_server {tags {"psync2"}} {
 start_server {} {
 start_server {} {
@@ -31,6 +32,9 @@ start_server {} {
         if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
     }
 
+    test "PSYNC2: ### SETTING diskless master: $mdl; diskless slave (sync, load): $sdls, $sdll ###" {
+    }
+    
     set cycle 1
     while {([clock seconds]-$start_time) < $duration} {
         test "PSYNC2: --- CYCLE $cycle ---" {
@@ -45,6 +49,8 @@ start_server {} {
         set used [list $master_id]
         test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
             $R($master_id) slaveof no one
+            $R($master_id) config set repl-diskless-sync $mdl
+            $R($master_id) config set repl-diskless-sync-delay 1
             if {$counter_value == 0} {
                 $R($master_id) set x $counter_value
             }
@@ -62,6 +68,9 @@ start_server {} {
             set master_port $R_port($mid)
 
             test "PSYNC2: Set #$slave_id to replicate from #$mid" {
+                $R($slave_id) config set repl-diskless-load $sdll
+                $R($slave_id) config set repl-diskless-sync $sdls
+                $R($slave_id) config set repl-diskless-sync-delay 1
                 $R($slave_id) slaveof $master_host $master_port
             }
             lappend used $slave_id
@@ -243,3 +252,12 @@ start_server {} {
     }
 
 }}}}}
+}
+
+foreach mdl {yes no} {
+    foreach sdls {yes no} {
+        foreach sdll {yes no} {                
+            test_psync2 $mdl $sdls $sdll
+        }
+    }
+}
diff --git a/tests/integration/replication-4.tcl b/tests/integration/replication-4.tcl
index 1c559b706..348b1cae5 100644
--- a/tests/integration/replication-4.tcl
+++ b/tests/integration/replication-4.tcl
@@ -1,12 +1,3 @@
-proc start_bg_complex_data {host port db ops} {
-    set tclsh [info nameofexecutable]
-    exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops &
-}
-
-proc stop_bg_complex_data {handle} {
-    catch {exec /bin/kill -9 $handle}
-}
-
 start_server {tags {"repl"}} {
     start_server {} {
 
@@ -153,3 +144,87 @@ start_server {tags {"repl"}} {
         }
     }
 }
+
+# test that restart of a slave that is not in sync, doesn't override an existing rdb
+start_server {tags {"repl"}} {
+    start_server {} {
+        set master [srv -1 client]
+        set master_host [srv -1 host]
+        set master_port [srv -1 port]
+        set slave [srv 0 client]
+
+        $master select 0
+        $slave select 0
+
+        # Populate master
+        for {set j 0} {$j < 100} {incr j} {
+            $master set key$j $j
+        }
+
+        # Connect slave to master
+        test {First server should have role slave after SLAVEOF} {
+            $slave slaveof $master_host $master_port
+            wait_for_condition 50 100 {
+                [s 0 master_link_status] eq {up}
+            } else {
+                fail "Replication not started."
+            }
+        }
+
+        test {Slave should sync with master} {
+            wait_for_condition 50 100 {
+                [$slave dbsize] == 100
+            } else {
+                fail "Replication not completed."
+            }
+        }
+
+        # Disconnect slave
+        $slave slaveof no one
+        $slave save
+
+        # Make sure no RDB saving is in progress
+        test {Make sure no RDB saving is in progress} {
+            wait_for_condition 50 100 {
+                [s -1 rdb_bgsave_in_progress] eq {0}
+            } else {
+                fail "RDB saving never finished."
+            }
+        }
+
+        # Setup delay to simulate a long RDB transfer time
+        # 50000 microseconds * 100 keys = 5 seconds
+        $master config set rdb-key-save-delay 50000
+
+        # Connect slave to master
+        $slave slaveof $master_host $master_port
+
+        # Make sure master started sending the file
+        test {Make sure master started sending RDB} {
+            wait_for_condition 50 100 {
+                [s -1 rdb_bgsave_in_progress] eq {1}
+            } else {
+                fail "RDB saving never started."
+            }
+        }
+
+        test {Kill master and restart slave} {
+            # Kill the master mid-RDB sending
+            catch {$master shutdown}
+            
+            # Restart slave
+            catch {$slave debug restart}
+        }
+
+        after 100
+
+        # Make sure it has all 100 keys
+        test {Slave should load old RDB} {
+            wait_for_condition 50 100 {
+                [$slave dbsize] == 100
+            } else {
+                fail "RDB not loaded."
+            }
+        }
+    }
+}
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index 2b9e13f50..ac9cca71a 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -1,12 +1,3 @@
-proc start_bg_complex_data {host port db ops} {
-    set tclsh [info nameofexecutable]
-    exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops &
-}
-
-proc stop_bg_complex_data {handle} {
-    catch {exec /bin/kill -9 $handle}
-}
-
 # Creates a master-slave pair and breaks the link continuously to force
 # partial resyncs attempts, all this while flooding the master with
 # write queries.
@@ -17,7 +8,7 @@ proc stop_bg_complex_data {handle} {
 # If reconnect is > 0, the test actually try to break the connection and
 # reconnect with the master, otherwise just the initial synchronization is
 # checked for consistency.
-proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless reconnect} {
+proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} {
     start_server {tags {"repl"}} {
         start_server {} {
 
@@ -28,8 +19,9 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
 
             $master config set repl-backlog-size $backlog_size
             $master config set repl-backlog-ttl $backlog_ttl
-            $master config set repl-diskless-sync $diskless
+            $master config set repl-diskless-sync $mdl
             $master config set repl-diskless-sync-delay 1
+            $slave config set repl-diskless-load $sdl
 
             set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
             set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
@@ -54,7 +46,7 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
                 }
             }
 
-            test "Test replication partial resync: $descr (diskless: $diskless, reconnect: $reconnect)" {
+            test "Test replication partial resync: $descr (diskless: $mdl, $sdl)" {
                 # Now while the clients are writing data, break the maste-slave
                 # link multiple times.
                 if ($reconnect) {
@@ -79,6 +71,32 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
                 stop_bg_complex_data $load_handle0
                 stop_bg_complex_data $load_handle1
                 stop_bg_complex_data $load_handle2
+
+                # Wait for the slave to reach the "online"
+                # state from the POV of the master.
+                set retry 5000
+                while {$retry} {
+                    set info [$master info]
+                    if {[string match {*slave0:*state=online*} $info]} {
+                        break
+                    } else {
+                        incr retry -1
+                        after 100
+                    }
+                }
+                if {$retry == 0} {
+                    error "assertion:Slave not correctly synchronized"
+                }
+
+                # Wait that slave acknowledge it is online so
+                # we are sure that DBSIZE and DEBUG DIGEST will not
+                # fail because of timing issues. (-LOADING error)
+                wait_for_condition 5000 100 {
+                    [lindex [$slave role] 3] eq {connected}
+                } else {
+                    fail "Slave still not connected after some time"
+                }  
+
                 set retry 10
                 while {$retry && ([$master debug digest] ne [$slave debug digest])}\
                 {
@@ -106,23 +124,25 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
     }
 }
 
-foreach diskless {no yes} {
-    test_psync {no reconnection, just sync} 6 1000000 3600 0 {
-    } $diskless 0
+foreach mdl {no yes} {
+    foreach sdl {no yes} {
+        test_psync {no reconnection, just sync} 6 1000000 3600 0 {
+        } $mdl $sdl 0
 
-    test_psync {ok psync} 6 100000000 3600 0 {
+        test_psync {ok psync} 6 100000000 3600 0 {
         assert {[s -1 sync_partial_ok] > 0}
-    } $diskless 1
+        } $mdl $sdl 1
 
-    test_psync {no backlog} 6 100 3600 0.5 {
+        test_psync {no backlog} 6 100 3600 0.5 {
         assert {[s -1 sync_partial_err] > 0}
-    } $diskless 1
+        } $mdl $sdl 1
 
-    test_psync {ok after delay} 3 100000000 3600 3 {
+        test_psync {ok after delay} 3 100000000 3600 3 {
         assert {[s -1 sync_partial_ok] > 0}
-    } $diskless 1
+        } $mdl $sdl 1
 
-    test_psync {backlog expired} 3 100000000 1 3 {
+        test_psync {backlog expired} 3 100000000 1 3 {
         assert {[s -1 sync_partial_err] > 0}
-    } $diskless 1
+        } $mdl $sdl 1
+    }
 }
diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index e811cf0ee..fb05328e9 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -183,85 +183,92 @@ start_server {tags {"repl"}} {
     }
 }
 
-foreach dl {no yes} {
-    start_server {tags {"repl"}} {
-        set master [srv 0 client]
-        $master config set repl-diskless-sync $dl
-        set master_host [srv 0 host]
-        set master_port [srv 0 port]
-        set slaves {}
-        set load_handle0 [start_write_load $master_host $master_port 3]
-        set load_handle1 [start_write_load $master_host $master_port 5]
-        set load_handle2 [start_write_load $master_host $master_port 20]
-        set load_handle3 [start_write_load $master_host $master_port 8]
-        set load_handle4 [start_write_load $master_host $master_port 4]
-        start_server {} {
-            lappend slaves [srv 0 client]
+foreach mdl {no yes} {
+    foreach sdl {no yes} {
+        start_server {tags {"repl"}} {
+            set master [srv 0 client]
+            $master config set repl-diskless-sync $mdl
+            $master config set repl-diskless-sync-delay 1
+            set master_host [srv 0 host]
+            set master_port [srv 0 port]
+            set slaves {}
+            set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000000]
+            set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000000]
+            set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000000]
+            set load_handle3 [start_write_load $master_host $master_port 8]
+            set load_handle4 [start_write_load $master_host $master_port 4]
+            after 5000 ;# wait for some data to accumulate so that we have RDB part for the fork
             start_server {} {
                 lappend slaves [srv 0 client]
                 start_server {} {
                     lappend slaves [srv 0 client]
-                    test "Connect multiple slaves at the same time (issue #141), diskless=$dl" {
-                        # Send SLAVEOF commands to slaves
-                        [lindex $slaves 0] slaveof $master_host $master_port
-                        [lindex $slaves 1] slaveof $master_host $master_port
-                        [lindex $slaves 2] slaveof $master_host $master_port
-
-                        # Wait for all the three slaves to reach the "online"
-                        # state from the POV of the master.
-                        set retry 500
-                        while {$retry} {
-                            set info [r -3 info]
-                            if {[string match {*slave0:*state=online*slave1:*state=online*slave2:*state=online*} $info]} {
-                                break
+                    start_server {} {
+                        lappend slaves [srv 0 client]
+                        test "Connect multiple slaves at the same time (issue #141), master diskless=$mdl, slave diskless=$sdl" {
+                            # Send SLAVEOF commands to slaves
+                            [lindex $slaves 0] config set repl-diskless-load $sdl
+                            [lindex $slaves 1] config set repl-diskless-load $sdl
+                            [lindex $slaves 2] config set repl-diskless-load $sdl
+                            [lindex $slaves 0] slaveof $master_host $master_port
+                            [lindex $slaves 1] slaveof $master_host $master_port
+                            [lindex $slaves 2] slaveof $master_host $master_port
+
+                            # Wait for all the three slaves to reach the "online"
+                            # state from the POV of the master.
+                            set retry 500
+                            while {$retry} {
+                                set info [r -3 info]
+                                if {[string match {*slave0:*state=online*slave1:*state=online*slave2:*state=online*} $info]} {
+                                    break
+                                } else {
+                                    incr retry -1
+                                    after 100
+                                }
+                            }
+                            if {$retry == 0} {
+                                error "assertion:Slaves not correctly synchronized"
+                            }
+
+                            # Wait that slaves acknowledge they are online so
+                            # we are sure that DBSIZE and DEBUG DIGEST will not
+                            # fail because of timing issues.
+                            wait_for_condition 500 100 {
+                                [lindex [[lindex $slaves 0] role] 3] eq {connected} &&
+                                [lindex [[lindex $slaves 1] role] 3] eq {connected} &&
+                                [lindex [[lindex $slaves 2] role] 3] eq {connected}
                             } else {
-                                incr retry -1
-                                after 100
+                                fail "Slaves still not connected after some time"
                             }
-                        }
-                        if {$retry == 0} {
-                            error "assertion:Slaves not correctly synchronized"
-                        }
 
-                        # Wait that slaves acknowledge they are online so
-                        # we are sure that DBSIZE and DEBUG DIGEST will not
-                        # fail because of timing issues.
-                        wait_for_condition 500 100 {
-                            [lindex [[lindex $slaves 0] role] 3] eq {connected} &&
-                            [lindex [[lindex $slaves 1] role] 3] eq {connected} &&
-                            [lindex [[lindex $slaves 2] role] 3] eq {connected}
-                        } else {
-                            fail "Slaves still not connected after some time"
-                        }
+                            # Stop the write load
+                            stop_bg_complex_data $load_handle0
+                            stop_bg_complex_data $load_handle1
+                            stop_bg_complex_data $load_handle2
+                            stop_write_load $load_handle3
+                            stop_write_load $load_handle4
+
+                            # Make sure that slaves and master have same
+                            # number of keys
+                            wait_for_condition 500 100 {
+                                [$master dbsize] == [[lindex $slaves 0] dbsize] &&
+                                [$master dbsize] == [[lindex $slaves 1] dbsize] &&
+                                [$master dbsize] == [[lindex $slaves 2] dbsize]
+                            } else {
+                                fail "Different number of keys between master and slave after too long time."
+                            }
 
-                        # Stop the write load
-                        stop_write_load $load_handle0
-                        stop_write_load $load_handle1
-                        stop_write_load $load_handle2
-                        stop_write_load $load_handle3
-                        stop_write_load $load_handle4
-
-                        # Make sure that slaves and master have same
-                        # number of keys
-                        wait_for_condition 500 100 {
-                            [$master dbsize] == [[lindex $slaves 0] dbsize] &&
-                            [$master dbsize] == [[lindex $slaves 1] dbsize] &&
-                            [$master dbsize] == [[lindex $slaves 2] dbsize]
-                        } else {
-                            fail "Different number of keys between masted and slave after too long time."
+                            # Check digests
+                            set digest [$master debug digest]
+                            set digest0 [[lindex $slaves 0] debug digest]
+                            set digest1 [[lindex $slaves 1] debug digest]
+                            set digest2 [[lindex $slaves 2] debug digest]
+                            assert {$digest ne 0000000000000000000000000000000000000000}
+                            assert {$digest eq $digest0}
+                            assert {$digest eq $digest1}
+                            assert {$digest eq $digest2}
                         }
-
-                        # Check digests
-                        set digest [$master debug digest]
-                        set digest0 [[lindex $slaves 0] debug digest]
-                        set digest1 [[lindex $slaves 1] debug digest]
-                        set digest2 [[lindex $slaves 2] debug digest]
-                        assert {$digest ne 0000000000000000000000000000000000000000}
-                        assert {$digest eq $digest0}
-                        assert {$digest eq $digest1}
-                        assert {$digest eq $digest2}
-                    }
-               }
+                   }
+                }
             }
         }
     }
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 64c36b326..c3679ef30 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -375,3 +375,15 @@ proc start_write_load {host port seconds} {
 proc stop_write_load {handle} {
     catch {exec /bin/kill -9 $handle}
 }
+
+# Execute a background process writing complex data for the specified number
+# of ops to the specified Redis instance.
+proc start_bg_complex_data {host port db ops} {
+    set tclsh [info nameofexecutable]
+    exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops &
+}
+
+# Stop a process generating write load executed with start_bg_complex_data.
+proc stop_bg_complex_data {handle} {
+    catch {exec /bin/kill -9 $handle}
+}
author	Oran Agra <oran@redislabs.com>	2015-03-02 11:20:19 +0200
committer	antirez <antirez@gmail.com>	2017-12-11 11:09:33 +0100
commit	9e5224bfd5ec13c14f89fe5eb15441a1d74d8dc5 (patch)
tree	3a600448a3a433b5fe1e1d360efa2dcd3ca97a72 /tests
parent	0f26125841b13c29d89a2b78957262f164b31770 (diff)
download	redis-9e5224bfd5ec13c14f89fe5eb15441a1d74d8dc5.tar.gz