Attempt to solve MacOS CI issues in GH Actions (#12013)

The macOS CI in GitHub Actions often hangs without any logs. GitHub argues that this is due to resource utilization: the runner runs out of disk space or memory, or suffers CPU starvation, and is therefore terminated.

This PR contains multiple attempts to resolve this:

1. Introduce `pause_process` instead of a bare SIGSTOP. It waits for the process to actually stop before resuming the test, possibly resolving race conditions in some tests. This was a suspect since there was one test that could result in an infinite loop in that case. In practice this didn't help, but it is still a good idea to keep.
2. Disable the `save` config in many tests that don't need it, specifically ones that use heavy writes and could create large files.
3. Change the `populate` proc to use a short pipeline rather than an infinite one.
4. Use `--clients 1` in the macOS CI so that we don't risk running multiple resource-demanding tests in parallel.
5. Allow `--verbose` to be repeated to elevate verbosity and print more info to stdout when a test or a server starts.
author: Oran Agra <oran@redislabs.com> 2023-04-12 09:19:21 +0300
committer: GitHub <noreply@github.com> 2023-04-12 09:19:21 +0300
commit: 997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2 (patch)
tree: e9518b5a84c42b73ba39b735bc4d11010adc3d70 /tests
parent: 45b8eea19f3e2491dec669f0745e513a4c9d7329 (diff)
download: redis-997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2.tar.gz
21 files changed, 141 insertions, 107 deletions
diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl
index 74f6b4949..1d41a8a83 100644
--- a/tests/integration/aof-multi-part.tcl
+++ b/tests/integration/aof-multi-part.tcl
@@ -755,7 +755,7 @@ tags {"external:skip"} {
     # writing pressure, etc.
 
 
-    start_server {tags {"Multi Part AOF"} overrides {aof-use-rdb-preamble {yes} appendonly {no}}} {
+    start_server {tags {"Multi Part AOF"} overrides {aof-use-rdb-preamble {yes} appendonly {no} save {}}} {
         set dir [get_redis_dir]
         set aof_basename "appendonly.aof"
         set aof_dirname "appendonlydir"
@@ -1173,7 +1173,7 @@ tags {"external:skip"} {
             assert {$d1 eq $d2}
         }
 
-        start_server {overrides {aof-use-rdb-preamble {yes} appendonly {no}}} {
+        start_server {overrides {aof-use-rdb-preamble {yes} appendonly {no} save {}}} {
             set dir [get_redis_dir]
             set aof_basename "appendonly.aof"
             set aof_dirname "appendonlydir"
diff --git a/tests/integration/block-repl.tcl b/tests/integration/block-repl.tcl
index 3f3a86ed8..52b4a53ea 100644
--- a/tests/integration/block-repl.tcl
+++ b/tests/integration/block-repl.tcl
@@ -12,7 +12,7 @@ proc stop_bg_block_op {handle} {
 }
 
 start_server {tags {"repl" "external:skip"}} {
-    start_server {} {
+    start_server {overrides {save {}}} {
         set master [srv -1 client]
         set master_host [srv -1 host]
         set master_port [srv -1 port]
diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl
index 2cd944851..21fa3d281 100644
--- a/tests/integration/failover.tcl
+++ b/tests/integration/failover.tcl
@@ -1,6 +1,6 @@
-start_server {tags {"failover external:skip"}} {
-start_server {} {
-start_server {} {
+start_server {tags {"failover external:skip"} overrides {save {}}} {
+start_server {overrides {save {}}} {
+start_server {overrides {save {}}} {
     set node_0 [srv 0 client]
     set node_0_host [srv 0 host]
     set node_0_port [srv 0 port]
@@ -66,13 +66,13 @@ start_server {} {
 
         # Generate a delta between primary and replica
         set load_handler [start_write_load $node_0_host $node_0_port 5]
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         wait_for_condition 50 100 {
             [s 0 total_commands_processed] > 100
         } else {
             fail "Node 0 did not accept writes"
         }
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Execute the failover
         $node_0 failover to $node_1_host $node_1_port
@@ -108,7 +108,7 @@ start_server {} {
 
         wait_for_ofs_sync $node_1 $node_2
         # We stop node 0 to and make sure node 2 is selected
-        exec kill -SIGSTOP $node_0_pid
+        pause_process $node_0_pid
         $node_1 set CASE 1
         $node_1 FAILOVER
 
@@ -118,7 +118,7 @@ start_server {} {
         } else {
             fail "Failover from node 1 to node 2 did not finish"
         }
-        exec kill -SIGCONT $node_0_pid
+        resume_process $node_0_pid
         $node_0 replicaof $node_2_host $node_2_port
 
         wait_for_sync $node_0
@@ -138,7 +138,7 @@ start_server {} {
         set initial_psyncs [s 0 sync_partial_ok]
         set initial_syncs [s 0 sync_full]
 
-        exec kill -SIGSTOP $node_0_pid
+        pause_process $node_0_pid
         # node 0 will never acknowledge this write
         $node_2 set case 2
         $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE
@@ -155,7 +155,7 @@ start_server {} {
         assert_match *slave* [$node_1 role]
         assert_match *slave* [$node_2 role]
 
-        exec kill -SIGCONT $node_0_pid
+        resume_process $node_0_pid
 
         # Wait for failover to end
         wait_for_condition 50 100 {
@@ -186,7 +186,7 @@ start_server {} {
         set initial_syncs [s 0 sync_full]
 
         # Stop replica so it never catches up
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         $node_0 SET CASE 1
         
         $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500
@@ -197,7 +197,7 @@ start_server {} {
             fail "Failover from node_0 to replica did not finish"
         }
 
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # We need to make sure the nodes actually sync back up
         wait_for_ofs_sync $node_0 $node_1
@@ -218,7 +218,7 @@ start_server {} {
         set initial_syncs [s 0 sync_full]
     
         # Stop replica so it never catches up
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         $node_0 SET CASE 2
         
         $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000
@@ -230,7 +230,7 @@ start_server {} {
         $node_0 failover abort
         assert_match [s 0 master_failover_state] "no-failover"
 
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Just make sure everything is still synced
         wait_for_ofs_sync $node_0 $node_1
@@ -255,11 +255,11 @@ start_server {} {
 
         # We pause the target long enough to send a write command
         # during the pause. This write will not be interrupted.
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         set rd [redis_deferring_client]
         $rd SET FOO BAR
         $node_0 failover to $node_1_host $node_1_port
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Wait for failover to end
         wait_for_condition 50 100 {
diff --git a/tests/integration/replication-2.tcl b/tests/integration/replication-2.tcl
index f9f259211..c18ff24fc 100644
--- a/tests/integration/replication-2.tcl
+++ b/tests/integration/replication-2.tcl
@@ -42,7 +42,7 @@ start_server {tags {"repl external:skip"}} {
         test {No write if min-slaves-max-lag is > of the slave lag} {
             r config set min-slaves-to-write 1
             r config set min-slaves-max-lag 2
-            exec kill -SIGSTOP [srv -1 pid]
+            pause_process [srv -1 pid]
             assert {[r set foo 12345] eq {OK}}
             wait_for_condition 100 100 {
                 [catch {r set foo 12345}] != 0
@@ -52,7 +52,7 @@ start_server {tags {"repl external:skip"}} {
             catch {r set foo 12345} err
             assert_match {NOREPLICAS*} $err
         }
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         test {min-slaves-to-write is ignored by slaves} {
             r config set min-slaves-to-write 1
diff --git a/tests/integration/replication-4.tcl b/tests/integration/replication-4.tcl
index f772eccb2..867ef364e 100644
--- a/tests/integration/replication-4.tcl
+++ b/tests/integration/replication-4.tcl
@@ -1,5 +1,5 @@
-start_server {tags {"repl network external:skip singledb:skip"}} {
-    start_server {} {
+start_server {tags {"repl network external:skip singledb:skip"} overrides {save {}}} {
+    start_server { overrides {save {}}} {
 
         set master [srv -1 client]
         set master_host [srv -1 host]
@@ -104,7 +104,7 @@ start_server {tags {"repl external:skip"}} {
             assert_equal OK [$master set foo 123]
             assert_equal OK [$master eval "return redis.call('set','foo',12345)" 0]
             # Killing a slave to make it become a lagged slave.
-            exec kill -SIGSTOP [srv 0 pid]
+            pause_process [srv 0 pid]
             # Waiting for slave kill.
             wait_for_condition 100 100 {
                 [catch {$master set foo 123}] != 0
@@ -113,7 +113,7 @@ start_server {tags {"repl external:skip"}} {
             }
             assert_error "*NOREPLICAS*" {$master set foo 123}
             assert_error "*NOREPLICAS*" {$master eval "return redis.call('set','foo',12345)" 0}
-            exec kill -SIGCONT [srv 0 pid]
+            resume_process [srv 0 pid]
         }
     }
 }
@@ -146,12 +146,12 @@ start_server {tags {"repl external:skip"}} {
             $master debug set-active-expire 0
             $master set k 1 px $px_ms
             wait_for_ofs_sync $master $slave
-            exec kill -SIGSTOP [srv 0 pid]
+            pause_process [srv 0 pid]
             $master incr k
             after [expr $px_ms + 1]
             # Stopping the replica for one second to makes sure the INCR arrives
             # to the replica after the key is logically expired.
-            exec kill -SIGCONT [srv 0 pid]
+            resume_process [srv 0 pid]
             wait_for_ofs_sync $master $slave
             # Check that k is logically expired but is present in the replica.
             set res [$slave exists k]
diff --git a/tests/integration/replication-buffer.tcl b/tests/integration/replication-buffer.tcl
index 2e402480d..143dc74aa 100644
--- a/tests/integration/replication-buffer.tcl
+++ b/tests/integration/replication-buffer.tcl
@@ -159,7 +159,7 @@ start_server {} {
         assert {[s repl_backlog_histlen] > [expr 2*10000*10000]}
         assert_equal [s connected_slaves] {2}
 
-        exec kill -SIGSTOP $replica2_pid
+        pause_process $replica2_pid
         r config set client-output-buffer-limit "replica 128k 0 0"
         # trigger output buffer limit check
         r set key [string repeat A [expr 64*1024]]
@@ -178,7 +178,7 @@ start_server {} {
         } else {
             fail "Replication backlog memory is not smaller"
         }
-        exec kill -SIGCONT $replica2_pid
+        resume_process $replica2_pid
     }
     # speed up termination
     $master config set shutdown-timeout 0
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index 16f3b8889..dc1df0fa6 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -9,8 +9,8 @@
 # reconnect with the master, otherwise just the initial synchronization is
 # checked for consistency.
 proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} {
-    start_server {tags {"repl"}} {
-        start_server {} {
+    start_server {tags {"repl"} overrides {save {}}} {
+        start_server {overrides {save {}}} {
 
             set master [srv -1 client]
             set master_host [srv -1 host]
diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index b4e9ee673..de4d527f4 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -302,7 +302,7 @@ start_server {tags {"repl external:skip"}} {
 
 foreach mdl {no yes} {
     foreach sdl {disabled swapdb} {
-        start_server {tags {"repl external:skip"}} {
+        start_server {tags {"repl external:skip"} overrides {save {}}} {
             set master [srv 0 client]
             $master config set repl-diskless-sync $mdl
             $master config set repl-diskless-sync-delay 5
@@ -310,11 +310,11 @@ foreach mdl {no yes} {
             set master_host [srv 0 host]
             set master_port [srv 0 port]
             set slaves {}
-            start_server {} {
+            start_server {overrides {save {}}} {
                 lappend slaves [srv 0 client]
-                start_server {} {
+                start_server {overrides {save {}}} {
                     lappend slaves [srv 0 client]
-                    start_server {} {
+                    start_server {overrides {save {}}} {
                         lappend slaves [srv 0 client]
                         test "Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl" {
                             # start load handles only inside the test, so that the test can be skipped
@@ -391,11 +391,11 @@ foreach mdl {no yes} {
     }
 }
 
-start_server {tags {"repl external:skip"}} {
+start_server {tags {"repl external:skip"} overrides {save {}}} {
     set master [srv 0 client]
     set master_host [srv 0 host]
     set master_port [srv 0 port]
-    start_server {} {
+    start_server {overrides {save {}}} {
         test "Master stream is correctly processed while the replica has a script in -BUSY state" {
             set load_handle0 [start_write_load $master_host $master_port 3]
             set slave [srv 0 client]
@@ -705,11 +705,11 @@ foreach testType {Successful Aborted} {
 }
 
 test {diskless loading short read} {
-    start_server {tags {"repl"}} {
+    start_server {tags {"repl"} overrides {save ""}} {
         set replica [srv 0 client]
         set replica_host [srv 0 host]
         set replica_port [srv 0 port]
-        start_server {} {
+        start_server {overrides {save ""}} {
             set master [srv 0 client]
             set master_host [srv 0 host]
             set master_port [srv 0 port]
@@ -847,7 +847,7 @@ proc compute_cpu_usage {start end} {
 
 
 # test diskless rdb pipe with multiple replicas, which may drop half way
-start_server {tags {"repl external:skip"}} {
+start_server {tags {"repl external:skip"} overrides {save ""}} {
     set master [srv 0 client]
     $master config set repl-diskless-sync yes
     $master config set repl-diskless-sync-delay 5
@@ -868,10 +868,10 @@ start_server {tags {"repl external:skip"}} {
             set replicas {}
             set replicas_alive {}
             # start one replica that will read the rdb fast, and one that will be slow
-            start_server {} {
+            start_server {overrides {save ""}} {
                 lappend replicas [srv 0 client]
                 lappend replicas_alive [srv 0 client]
-                start_server {} {
+                start_server {overrides {save ""}} {
                     lappend replicas [srv 0 client]
                     lappend replicas_alive [srv 0 client]
 
@@ -913,7 +913,7 @@ start_server {tags {"repl external:skip"}} {
                     if {$all_drop == "timeout"} {
                         $master config set repl-timeout 2
                         # we want the slow replica to hang on a key for very long so it'll reach repl-timeout
-                        exec kill -SIGSTOP [srv -1 pid]
+                        pause_process [srv -1 pid]
                         after 2000
                     }
 
@@ -940,7 +940,7 @@ start_server {tags {"repl external:skip"}} {
                         # master disconnected the slow replica, remove from array
                         set replicas_alive [lreplace $replicas_alive 0 0]
                         # release it
-                        exec kill -SIGCONT [srv -1 pid]
+                        resume_process [srv -1 pid]
                     }
 
                     # make sure we don't have a busy loop going thought epoll_wait
@@ -1000,7 +1000,7 @@ test "diskless replication child being killed is collected" {
     # when diskless master is waiting for the replica to become writable
     # it removes the read event from the rdb pipe so if the child gets killed
     # the replica will hung. and the master may not collect the pid with waitpid
-    start_server {tags {"repl"}} {
+    start_server {tags {"repl"} overrides {save ""}} {
         set master [srv 0 client]
         set master_host [srv 0 host]
         set master_port [srv 0 port]
@@ -1010,7 +1010,7 @@ test "diskless replication child being killed is collected" {
         # put enough data in the db that the rdb file will be bigger than the socket buffers
         $master debug populate 20000 test 10000
         $master config set rdbcompression no
-        start_server {} {
+        start_server {overrides {save ""}} {
             set replica [srv 0 client]
             set loglines [count_log_lines 0]
             $replica config set repl-diskless-load swapdb
@@ -1044,7 +1044,7 @@ test "diskless replication child being killed is collected" {
 foreach mdl {yes no} {
     test "replication child dies when parent is killed - diskless: $mdl" {
         # when master is killed, make sure the fork child can detect that and exit
-        start_server {tags {"repl"}} {
+        start_server {tags {"repl"} overrides {save ""}} {
             set master [srv 0 client]
             set master_host [srv 0 host]
             set master_port [srv 0 port]
@@ -1054,7 +1054,7 @@ foreach mdl {yes no} {
             # create keys that will take 10 seconds to save
             $master config set rdb-key-save-delay 1000
             $master debug populate 10000
-            start_server {} {
+            start_server {overrides {save ""}} {
                 set replica [srv 0 client]
                 $replica replicaof $master_host $master_port
 
@@ -1085,7 +1085,7 @@ test "diskless replication read pipe cleanup" {
     # When we close this pipe (fd), the read handler also needs to be removed from the event loop (if it still registered).
     # Otherwise, next time we will use the same fd, the registration will be fail (panic), because
     # we will use EPOLL_CTL_MOD (the fd still register in the event loop), on fd that already removed from epoll_ctl
-    start_server {tags {"repl"}} {
+    start_server {tags {"repl"} overrides {save ""}} {
         set master [srv 0 client]
         set master_host [srv 0 host]
         set master_port [srv 0 port]
@@ -1097,7 +1097,7 @@ test "diskless replication read pipe cleanup" {
         $master config set rdb-key-save-delay 100000
         $master debug populate 20000 test 10000
         $master config set rdbcompression no
-        start_server {} {
+        start_server {overrides {save ""}} {
             set replica [srv 0 client]
             set loglines [count_log_lines 0]
             $replica config set repl-diskless-load swapdb
@@ -1122,17 +1122,17 @@ test "diskless replication read pipe cleanup" {
 test {replicaof right after disconnection} {
     # this is a rare race condition that was reproduced sporadically by the psync2 unit.
     # see details in #7205
-    start_server {tags {"repl"}} {
+    start_server {tags {"repl"} overrides {save ""}} {
         set replica1 [srv 0 client]
         set replica1_host [srv 0 host]
         set replica1_port [srv 0 port]
         set replica1_log [srv 0 stdout]
-        start_server {} {
+        start_server {overrides {save ""}} {
             set replica2 [srv 0 client]
             set replica2_host [srv 0 host]
             set replica2_port [srv 0 port]
             set replica2_log [srv 0 stdout]
-            start_server {} {
+            start_server {overrides {save ""}} {
                 set master [srv 0 client]
                 set master_host [srv 0 host]
                 set master_port [srv 0 port]
diff --git a/tests/integration/shutdown.tcl b/tests/integration/shutdown.tcl
index 60afc5c7f..b2ec32cbd 100644
--- a/tests/integration/shutdown.tcl
+++ b/tests/integration/shutdown.tcl
@@ -19,8 +19,8 @@ proc fill_up_os_socket_send_buffer_for_repl {idx} {
 
 foreach how {sigterm shutdown} {
     test "Shutting down master waits for replica to catch up ($how)" {
-        start_server {} {
-            start_server {} {
+        start_server {overrides {save ""}} {
+            start_server {overrides {save ""}} {
                 set master [srv -1 client]
                 set master_host [srv -1 host]
                 set master_port [srv -1 port]
@@ -42,8 +42,7 @@ foreach how {sigterm shutdown} {
                 wait_for_ofs_sync $master $replica
 
                 # Pause the replica.
-                exec kill -SIGSTOP $replica_pid
-                after 10
+                pause_process $replica_pid
 
                 # Fill up the OS socket send buffer for the replica connection
                 # to prevent the following INCR from reaching the replica via
@@ -69,7 +68,7 @@ foreach how {sigterm shutdown} {
 
                 # Wake up replica and check if master has waited for it.
                 after 20; # 2 cron intervals
-                exec kill -SIGCONT $replica_pid
+                resume_process $replica_pid
                 wait_for_condition 300 1000 {
                     [$replica get k] eq 2
                 } else {
@@ -86,8 +85,8 @@ foreach how {sigterm shutdown} {
 }
 
 test {Shutting down master waits for replica timeout} {
-    start_server {} {
-        start_server {} {
+    start_server {overrides {save ""}} {
+        start_server {overrides {save ""}} {
             set master [srv -1 client]
             set master_host [srv -1 host]
             set master_port [srv -1 port]
@@ -107,8 +106,7 @@ test {Shutting down master waits for replica timeout} {
             wait_for_ofs_sync $master $replica
 
             # Pause the replica.
-            exec kill -SIGSTOP $replica_pid
-            after 10
+            pause_process $replica_pid
 
             # Fill up the OS socket send buffer for the replica connection to
             # prevent the following INCR k from reaching the replica via the OS.
@@ -129,15 +127,15 @@ test {Shutting down master waits for replica timeout} {
             verify_log_message -1 "*0 of 1 replicas are in sync*" 0
 
             # Wake up replica.
-            exec kill -SIGCONT $replica_pid
+            resume_process $replica_pid
             assert_equal 1 [$replica get k]
         }
     }
 } {} {repl external:skip}
 
 test "Shutting down master waits for replica then fails" {
-    start_server {} {
-        start_server {} {
+    start_server {overrides {save ""}} {
+        start_server {overrides {save ""}} {
             set master [srv -1 client]
             set master_host [srv -1 host]
             set master_port [srv -1 port]
@@ -150,8 +148,7 @@ test "Shutting down master waits for replica then fails" {
             wait_for_sync $replica
 
             # Pause the replica and write a key on master.
-            exec kill -SIGSTOP $replica_pid
-            after 10
+            pause_process $replica_pid
             $master incr k
 
             # Two clients call blocking SHUTDOWN in parallel.
@@ -168,7 +165,7 @@ test "Shutting down master waits for replica then fails" {
             $master config set appendonly yes
 
             # Wake up replica, causing master to continue shutting down.
-            exec kill -SIGCONT $replica_pid
+            resume_process $replica_pid
 
             # SHUTDOWN returns an error to both clients blocking on SHUTDOWN.
             catch { $rd1 read } e1
@@ -190,8 +187,8 @@ test "Shutting down master waits for replica then fails" {
 } {} {repl external:skip}
 
 test "Shutting down master waits for replica then aborted" {
-    start_server {} {
-        start_server {} {
+    start_server {overrides {save ""}} {
+        start_server {overrides {save ""}} {
             set master [srv -1 client]
             set master_host [srv -1 host]
             set master_port [srv -1 port]
@@ -204,8 +201,7 @@ test "Shutting down master waits for replica then aborted" {
             wait_for_sync $replica
 
             # Pause the replica and write a key on master.
-            exec kill -SIGSTOP $replica_pid
-            after 10
+            pause_process $replica_pid
             $master incr k
 
             # Two clients call blocking SHUTDOWN in parallel.
@@ -221,7 +217,7 @@ test "Shutting down master waits for replica then aborted" {
             $master shutdown abort
 
             # Wake up replica, causing master to continue shutting down.
-            exec kill -SIGCONT $replica_pid
+            resume_process $replica_pid
 
             # SHUTDOWN returns an error to both clients blocking on SHUTDOWN.
             catch { $rd1 read } e1
diff --git a/tests/sentinel/tests/07-down-conditions.tcl b/tests/sentinel/tests/07-down-conditions.tcl
index 403f81e73..dabbc14c5 100644
--- a/tests/sentinel/tests/07-down-conditions.tcl
+++ b/tests/sentinel/tests/07-down-conditions.tcl
@@ -49,9 +49,9 @@ test "SDOWN is triggered by non-responding but not crashed instance" {
     set master_id [get_instance_id_by_port redis [lindex $master_addr 1]]
 
     set pid [get_instance_attrib redis $master_id pid]
-    exec kill -SIGSTOP $pid
+    pause_process $pid
     ensure_master_down
-    exec kill -SIGCONT $pid
+    resume_process $pid
     ensure_master_up
 }
 
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 4c596290d..9a3733b61 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -537,6 +537,9 @@ proc start_server {options {code undefined}} {
         set fd [open $stdout "a+"]
         puts $fd "### Starting server for test $::cur_test"
         close $fd
+        if {$::verbose > 1} {
+            puts "### Starting server $stdout for test - $::cur_test"
+        }
     }
 
     # We may have a stdout left over from the previous tests, so we need
diff --git a/tests/support/test.tcl b/tests/support/test.tcl
index 68180bea4..b7cd38b38 100644
--- a/tests/support/test.tcl
+++ b/tests/support/test.tcl
@@ -168,7 +168,9 @@ proc test {name code {okpattern undefined} {tags {}}} {
         send_data_packet $::test_server_fd skip $name
         return
     }
-
+    if {$::verbose > 1} {
+        puts "starting test $name"
+    }
     # abort if only_tests was set but test name is not included
     if {[llength $::only_tests] > 0 && ![search_pattern_list $name $::only_tests]} {
         incr ::num_skipped
@@ -200,11 +202,16 @@ proc test {name code {okpattern undefined} {tags {}}} {
             $r close
         }
     } else {
+        set servers {}
         foreach srv $::servers {
             set stdout [dict get $srv stdout]
             set fd [open $stdout "a+"]
             puts $fd "### Starting test $::cur_test"
             close $fd
+            lappend servers $stdout
+        }
+        if {$::verbose > 1} {
+            puts "### Starting test $::cur_test - with servers: $servers"
         }
     }
 
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 236fad314..c98e7801b 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -602,15 +602,24 @@ proc stop_bg_complex_data {handle} {
 # Write num keys with the given key prefix and value size (in bytes). If idx is
 # given, it's the index (AKA level) used with the srv procedure and it specifies
 # to which Redis instance to write the keys.
-proc populate {num {prefix key:} {size 3} {idx 0}} {
-    set rd [redis_deferring_client $idx]
-    for {set j 0} {$j < $num} {incr j} {
-        $rd set $prefix$j [string repeat A $size]
+proc populate {num {prefix key:} {size 3} {idx 0} {prints false}} {
+    r $idx deferred 1
+    if {$num > 16} {set pipeline 16} else {set pipeline $num}
+    set val [string repeat A $size]
+    for {set j 0} {$j < $pipeline} {incr j} {
+        r $idx set $prefix$j $val
+        if {$prints} {puts $j}
     }
-    for {set j 0} {$j < $num} {incr j} {
-        $rd read
+    for {} {$j < $num} {incr j} {
+        r $idx set $prefix$j $val
+        r $idx read
+        if {$prints} {puts $j}
     }
-    $rd close
+    for {set j 0} {$j < $pipeline} {incr j} {
+        r $idx read
+        if {$prints} {puts $j}
+    }
+    r $idx deferred 0
 }
 
 proc get_child_pid {idx} {
@@ -636,6 +645,20 @@ proc process_is_alive pid {
     }
 }
 
+proc pause_process pid {
+    exec kill -SIGSTOP $pid
+    wait_for_condition 50 100 {
+        [string match {*T*} [lindex [exec ps j $pid] 16]]
+    } else {
+        puts [exec ps j $pid]
+        fail "process didn't stop"
+    }
+}
+
+proc resume_process pid {
+    exec kill -SIGCONT $pid
+}
+
 proc cmdrstat {cmd r} {
     if {[regexp "\r\ncmdstat_$cmd:(.*?)\r\n" [$r info commandstats] _ value]} {
         set _ $value
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 922cb438d..6ec2ae1fc 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -526,6 +526,7 @@ proc signal_idle_client fd {
         incr ::next_test
         if {$::loop && $::next_test == [llength $::all_tests]} {
             set ::next_test 0
+            incr ::loop -1
         }
     } elseif {[llength $::run_solo_tests] != 0 && [llength $::active_clients] == 0} {
         if {!$::quiet} {
@@ -620,6 +621,7 @@ proc print_help_screen {} {
         "--no-latency       Skip latency measurements and validation by some tests."
         "--stop             Blocks once the first test fails."
         "--loop             Execute the specified set of tests forever."
+        "--loops <count>    Execute the specified set of tests several times."
         "--wait-server      Wait after server is started (so that you can attach a debugger)."
         "--dump-logs        Dump server log on test failure."
         "--tls              Run tests in TLS mode."
@@ -721,7 +723,7 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
         }
         exit 0
     } elseif {$opt eq {--verbose}} {
-        set ::verbose 1
+        incr ::verbose
     } elseif {$opt eq {--client}} {
         set ::client 1
         set ::test_server_port $arg
@@ -744,7 +746,10 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
     } elseif {$opt eq {--stop}} {
         set ::stop_on_failure 1
     } elseif {$opt eq {--loop}} {
-        set ::loop 1
+        set ::loop 2147483647
+    } elseif {$opt eq {--loops}} {
+        set ::loop $arg
+        incr j
     } elseif {$opt eq {--timeout}} {
         set ::timeout $arg
         incr j
diff --git a/tests/unit/aofrw.tcl b/tests/unit/aofrw.tcl
index fe07351a3..cc7545265 100644
--- a/tests/unit/aofrw.tcl
+++ b/tests/unit/aofrw.tcl
@@ -1,6 +1,6 @@
 # This unit has the potential to create huge .reqres files, causing log-req-res-validator.py to run for a very long time...
 # Since this unit doesn't do anything worth validating, reply_schema-wise, we decided to skip it
-start_server {tags {"aofrw external:skip logreqres:skip"}} {
+start_server {tags {"aofrw external:skip logreqres:skip"} overrides {save {}}} {
     # Enable the AOF
     r config set appendonly yes
     r config set auto-aof-rewrite-percentage 0 ; # Disable auto-rewrite.
diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl
index 76f7bf0f2..1fc7c02ca 100644
--- a/tests/unit/client-eviction.tcl
+++ b/tests/unit/client-eviction.tcl
@@ -347,12 +347,12 @@ start_server {} {
         # We use two obuf-clients to make sure that even if client eviction is attempted
         # between two command processing (with no sleep) we don't perform any client eviction
         # because the obuf limit is enforced with precedence.
-        exec kill -SIGSTOP $server_pid
+        pause_process $server_pid
         $rr2 get k
         $rr2 flush
         $rr3 get k
         $rr3 flush
-        exec kill -SIGCONT $server_pid
+        resume_process $server_pid
         r ping ;# make sure a full event loop cycle is processed before issuing CLIENT LIST
 
         # Validate obuf-clients were disconnected (because of obuf limit)
diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl
index 7131ee20f..3e7f1bedb 100644
--- a/tests/unit/cluster/cli.tcl
+++ b/tests/unit/cluster/cli.tcl
@@ -116,7 +116,7 @@ start_multiple_servers 3 [list overrides $base_conf] {
 
      test "Kill a cluster node and wait for fail state" {
         # kill node3 in cluster
-        exec kill -SIGSTOP $node3_pid
+        pause_process $node3_pid
 
         wait_for_condition 1000 50 {
             [CI 0 cluster_state] eq {fail} &&
@@ -134,7 +134,7 @@ start_multiple_servers 3 [list overrides $base_conf] {
         assert_equal [s -1 blocked_clients]  {0}
     }
 
-    exec kill -SIGCONT $node3_pid
+    resume_process $node3_pid
     $node1_rd close
 
 } ;# stop servers
diff --git a/tests/unit/cluster/links.tcl b/tests/unit/cluster/links.tcl
index 63c2b143c..a202c378b 100644
--- a/tests/unit/cluster/links.tcl
+++ b/tests/unit/cluster/links.tcl
@@ -200,7 +200,7 @@ start_cluster 3 0 {tags {external:skip cluster}} {
         # To manufacture an ever-growing send buffer from primary1 to primary2,
         # make primary2 unresponsive.
         set primary2_pid [srv [expr -1*$primary2_id] pid]
-        exec kill -SIGSTOP $primary2_pid
+        pause_process $primary2_pid
 
         # On primary1, send 128KB Pubsub messages in a loop until the send buffer of the link from
         # primary1 to primary2 exceeds buffer limit therefore be dropped.
@@ -226,7 +226,7 @@ start_cluster 3 0 {tags {external:skip cluster}} {
         assert {[dict get $same_link_p1_from_p2 create-time] eq [dict get $orig_link_p1_from_p2 create-time]}
 
         # Revive primary2
-        exec kill -SIGCONT $primary2_pid
+        resume_process $primary2_pid
 
         # Reset configs on primary1 so config changes don't leak out to other tests
         $primary1 CONFIG set cluster-node-timeout $oldtimeout
diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl
index 564250f2e..54aba6715 100644
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@@ -350,7 +350,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
 
             # put the slave to sleep
             set rd_slave [redis_deferring_client]
-            exec kill -SIGSTOP $slave_pid
+            pause_process $slave_pid
 
             # send some 10mb worth of commands that don't increase the memory usage
             if {$pipeline == 1} {
@@ -399,7 +399,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
 
         }
         # unfreeze slave process (after the 'test' succeeded or failed, but before we attempt to terminate the server
-        exec kill -SIGCONT $slave_pid
+        resume_process $slave_pid
         }
     }
 }
diff --git a/tests/unit/moduleapi/cluster.tcl b/tests/unit/moduleapi/cluster.tcl
index 43356f77d..807508387 100644
--- a/tests/unit/moduleapi/cluster.tcl
+++ b/tests/unit/moduleapi/cluster.tcl
@@ -132,7 +132,7 @@ start_cluster 3 0 [list config_lines $modules] {
 
     test "Kill a cluster node and wait for fail state" {
         # kill node3 in cluster
-        exec kill -SIGSTOP $node3_pid
+        pause_process $node3_pid
 
         wait_for_condition 1000 50 {
             [CI 0 cluster_state] eq {fail} &&
@@ -158,7 +158,7 @@ start_cluster 3 0 [list config_lines $modules] {
         assert_error "ERR Can not execute a command 'set' while the cluster is down" {$node1 do_rm_call set x 1}
     }
 
-    exec kill -SIGCONT $node3_pid
+    resume_process $node3_pid
     $node1_rd close
     $node2_rd close
 }
diff --git a/tests/unit/wait.tcl b/tests/unit/wait.tcl
index af13a3374..8c6010afb 100644
--- a/tests/unit/wait.tcl
+++ b/tests/unit/wait.tcl
@@ -47,25 +47,25 @@ start_server {} {
     }
 
     test {WAIT should not acknowledge 1 additional copy if slave is blocked} {
-        exec kill -SIGSTOP $slave_pid
+        pause_process $slave_pid
         $master set foo 0
         $master incr foo
         $master incr foo
         $master incr foo
         assert {[$master wait 1 1000] == 0}
-        exec kill -SIGCONT $slave_pid
+        resume_process $slave_pid
         assert {[$master wait 1 1000] == 1}
     }
 
     test {WAIT implicitly blocks on client pause since ACKs aren't sent} {
-        exec kill -SIGSTOP $slave_pid
+        pause_process $slave_pid
         $master multi
         $master incr foo
         $master client pause 10000 write
         $master exec
         assert {[$master wait 1 1000] == 0}
         $master client unpause
-        exec kill -SIGCONT $slave_pid
+        resume_process $slave_pid
         assert {[$master wait 1 1000] == 1}
     }
 
@@ -73,7 +73,7 @@ start_server {} {
         set rd [redis_deferring_client -1]
         set rd2 [redis_deferring_client -1]
 
-        exec kill -SIGSTOP $slave_pid
+        pause_process $slave_pid
 
         $rd incr foo
         $rd read
@@ -85,7 +85,7 @@ start_server {} {
         $rd2 wait 1 0
         wait_for_blocked_clients_count 2 100 10 -1
 
-        exec kill -SIGCONT $slave_pid
+        resume_process $slave_pid
 
         assert_equal [$rd read] {1}
         assert_equal [$rd2 read] {1}
@@ -229,10 +229,10 @@ tags {"wait aof network external:skip"} {
             }
 
             test {WAITAOF replica copy if replica is blocked} {
-                exec kill -SIGSTOP $replica_pid
+                pause_process $replica_pid
                 $master incr foo
                 assert_equal [$master waitaof 0 1 50] {1 0} ;# exits on timeout
-                exec kill -SIGCONT $replica_pid
+                resume_process $replica_pid
                 assert_equal [$master waitaof 0 1 0] {1 1}
             }
 
@@ -240,7 +240,7 @@ tags {"wait aof network external:skip"} {
                 set rd [redis_deferring_client -1]
                 set rd2 [redis_deferring_client -1]
 
-                exec kill -SIGSTOP $replica_pid
+                pause_process $replica_pid
 
                 $rd incr foo
                 $rd read
@@ -252,7 +252,7 @@ tags {"wait aof network external:skip"} {
                 $rd2 waitaof 0 1 0
                 wait_for_blocked_clients_count 2 100 10 -1
 
-                exec kill -SIGCONT $replica_pid
+                resume_process $replica_pid
 
                 assert_equal [$rd read] {1 1}
                 assert_equal [$rd2 read] {1 1}
@@ -438,7 +438,7 @@ start_server {} {
         waitForBgrewriteaof $replica1
         waitForBgrewriteaof $replica2
 
-        exec kill -SIGSTOP $replica1_pid
+        pause_process $replica1_pid
 
         $rd incr foo
         $rd read
@@ -451,7 +451,7 @@ start_server {} {
 
         wait_for_blocked_clients_count 2
 
-        exec kill -SIGCONT $replica1_pid
+        resume_process $replica1_pid
 
         # WAIT will unblock the client first.
         assert_equal [$rd2 read] {2}
author	Oran Agra <oran@redislabs.com>	2023-04-12 09:19:21 +0300
committer	GitHub <noreply@github.com>	2023-04-12 09:19:21 +0300
commit	997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2 (patch)
tree	e9518b5a84c42b73ba39b735bc4d11010adc3d70 /tests
parent	45b8eea19f3e2491dec669f0745e513a4c9d7329 (diff)
download	redis-997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2.tar.gz