make replication tests more stable on slow machines

Fix several race conditions in replication-related tests that caused failures on slow machines. Also fix a bug in the slave buffers test: since the test is executed twice, each time with a different command count, the threshold for the delta can't be a constant.
author: Oran Agra <oran@redislabs.com> 2019-05-05 08:19:52 +0300
committer: Oran Agra <oran@redislabs.com> 2019-05-05 08:25:01 +0300
commit: ba809f26d4bd81d23fa929d0c018f235ab298564 (patch)
tree: 567a60afcbd58f6d06be7465aa7a25906a73663f
parent: 0a6090bfd8fbec26f682ff0a1dc7a43699e0c9b7 (diff)
download: redis-ba809f26d4bd81d23fa929d0c018f235ab298564.tar.gz
3 files changed, 34 insertions, 4 deletions
diff --git a/tests/integration/psync2.tcl b/tests/integration/psync2.tcl
index 8663d6fcc..d1212b640 100644
--- a/tests/integration/psync2.tcl
+++ b/tests/integration/psync2.tcl
@@ -166,12 +166,15 @@ start_server {} {
         # Pick a random slave
         set slave_id [expr {($master_id+1)%5}]
         set sync_count [status $R($master_id) sync_full]
+        set sync_partial [status $R($master_id) sync_partial_ok]
         catch {
             $R($slave_id) config rewrite
             $R($slave_id) debug restart
         }
+        # Note: just waiting for connected_slaves==4 is racy, since we might do
+        # the check before the master has realized that the slave disconnected.
         wait_for_condition 50 1000 {
-            [status $R($master_id) connected_slaves] == 4
+            [status $R($master_id) sync_partial_ok] == $sync_partial + 1
         } else {
             fail "Replica not reconnecting"
         }
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index a3bce2a4c..bf8682446 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -79,6 +79,32 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
                 stop_bg_complex_data $load_handle0
                 stop_bg_complex_data $load_handle1
                 stop_bg_complex_data $load_handle2
+
+                # Wait for the slave to reach the "online"
+                # state from the POV of the master.
+                set retry 5000
+                while {$retry} {
+                    set info [$master info]
+                    if {[string match {*slave0:*state=online*} $info]} {
+                        break
+                    } else {
+                        incr retry -1
+                        after 100
+                    }
+                }
+                if {$retry == 0} {
+                    error "assertion:Slave not correctly synchronized"
+                }
+
+                # Wait for the slave to acknowledge that it is online, so
+                # we are sure that DBSIZE and DEBUG DIGEST will not
+                # fail because of timing issues (-LOADING error).
+                wait_for_condition 5000 100 {
+                    [lindex [$slave role] 3] eq {connected}
+                } else {
+                    fail "Slave still not connected after some time"
+                }  
+
                 set retry 10
                 while {$retry && ([$master debug digest] ne [$slave debug digest])}\
                 {
diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl
index 1def57af5..0f64ddc18 100644
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@@ -161,7 +161,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
             }
 
             # make sure master doesn't disconnect slave because of timeout
-            $master config set repl-timeout 300 ;# 5 minutes
+            $master config set repl-timeout 1200 ;# 20 minutes (for valgrind and slow machines)
             $master config set maxmemory-policy allkeys-random
             $master config set client-output-buffer-limit "replica 100000000 100000000 300"
             $master config set repl-backlog-size [expr {10*1024}]
@@ -212,7 +212,8 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
 
             assert {[$master dbsize] == 100}
             assert {$slave_buf > 2*1024*1024} ;# some of the data may have been pushed to the OS buffers
-            assert {$delta < 50*1024 && $delta > -50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            set delta_max [expr {$cmd_count / 2}] ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            assert {$delta < $delta_max && $delta > -$delta_max}
 
             $master client kill type slave
             set killed_used [s -1 used_memory]
@@ -221,7 +222,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
             set killed_used_no_repl [expr {$killed_used - $killed_mem_not_counted_for_evict}]
             set delta_no_repl [expr {$killed_used_no_repl - $used_no_repl}]
             assert {$killed_slave_buf == 0}
-            assert {$delta_no_repl > -50*1024 && $delta_no_repl < 50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            assert {$delta_no_repl > -$delta_max && $delta_no_repl < $delta_max}
 
         }
         # unfreeze slave process (after the 'test' succeeded or failed, but before we attempt to terminate the server
author	Oran Agra <oran@redislabs.com>	2019-05-05 08:19:52 +0300
committer	Oran Agra <oran@redislabs.com>	2019-05-05 08:25:01 +0300
commit	ba809f26d4bd81d23fa929d0c018f235ab298564 (patch)
tree	567a60afcbd58f6d06be7465aa7a25906a73663f
parent	0a6090bfd8fbec26f682ff0a1dc7a43699e0c9b7 (diff)
download	redis-ba809f26d4bd81d23fa929d0c018f235ab298564.tar.gz