Attempt to solve MacOS CI issues in GH Actions (#12013)

The macOS CI in GitHub Actions often hangs without any logs. GitHub argues that this is due to resource utilization (running out of disk space or memory, or CPU starvation), which causes the runner to be terminated. This PR contains multiple attempts to resolve this: 1. Introduce pause_process instead of SIGSTOP; it waits for the process to actually stop before the test resumes, possibly resolving race conditions in some tests. This was a suspect because there was one test that could end up in an infinite loop in that case. In practice it didn't help, but it is still a good idea to keep. 2. Disable the `save` config in many tests that don't need it, specifically ones that do heavy writes and could create large files. 3. Change the `populate` proc to use a short pipeline rather than an infinite one. 4. Use `--clients 1` in the macOS CI so that we don't risk running multiple resource-demanding tests in parallel. 5. Allow `--verbose` to be repeated to raise verbosity and print more info to stdout when a test or a server starts.
author: Oran Agra <oran@redislabs.com> 2023-04-12 09:19:21 +0300
committer: GitHub <noreply@github.com> 2023-04-12 09:19:21 +0300
commit: 997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2 (patch)
tree: e9518b5a84c42b73ba39b735bc4d11010adc3d70 /tests/integration/failover.tcl
parent: 45b8eea19f3e2491dec669f0745e513a4c9d7329 (diff)
download: redis-997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2.tar.gz
1 files changed, 15 insertions, 15 deletions
diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl
index 2cd944851..21fa3d281 100644
--- a/tests/integration/failover.tcl
+++ b/tests/integration/failover.tcl
@@ -1,6 +1,6 @@
-start_server {tags {"failover external:skip"}} {
-start_server {} {
-start_server {} {
+start_server {tags {"failover external:skip"} overrides {save {}}} {
+start_server {overrides {save {}}} {
+start_server {overrides {save {}}} {
     set node_0 [srv 0 client]
     set node_0_host [srv 0 host]
     set node_0_port [srv 0 port]
@@ -66,13 +66,13 @@ start_server {} {
 
         # Generate a delta between primary and replica
         set load_handler [start_write_load $node_0_host $node_0_port 5]
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         wait_for_condition 50 100 {
             [s 0 total_commands_processed] > 100
         } else {
             fail "Node 0 did not accept writes"
         }
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Execute the failover
         $node_0 failover to $node_1_host $node_1_port
@@ -108,7 +108,7 @@ start_server {} {
 
         wait_for_ofs_sync $node_1 $node_2
         # We stop node 0 to and make sure node 2 is selected
-        exec kill -SIGSTOP $node_0_pid
+        pause_process $node_0_pid
         $node_1 set CASE 1
         $node_1 FAILOVER
 
@@ -118,7 +118,7 @@ start_server {} {
         } else {
             fail "Failover from node 1 to node 2 did not finish"
         }
-        exec kill -SIGCONT $node_0_pid
+        resume_process $node_0_pid
         $node_0 replicaof $node_2_host $node_2_port
 
         wait_for_sync $node_0
@@ -138,7 +138,7 @@ start_server {} {
         set initial_psyncs [s 0 sync_partial_ok]
         set initial_syncs [s 0 sync_full]
 
-        exec kill -SIGSTOP $node_0_pid
+        pause_process $node_0_pid
         # node 0 will never acknowledge this write
         $node_2 set case 2
         $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE
@@ -155,7 +155,7 @@ start_server {} {
         assert_match *slave* [$node_1 role]
         assert_match *slave* [$node_2 role]
 
-        exec kill -SIGCONT $node_0_pid
+        resume_process $node_0_pid
 
         # Wait for failover to end
         wait_for_condition 50 100 {
@@ -186,7 +186,7 @@ start_server {} {
         set initial_syncs [s 0 sync_full]
 
         # Stop replica so it never catches up
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         $node_0 SET CASE 1
         
         $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500
@@ -197,7 +197,7 @@ start_server {} {
             fail "Failover from node_0 to replica did not finish"
         }
 
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # We need to make sure the nodes actually sync back up
         wait_for_ofs_sync $node_0 $node_1
@@ -218,7 +218,7 @@ start_server {} {
         set initial_syncs [s 0 sync_full]
     
         # Stop replica so it never catches up
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         $node_0 SET CASE 2
         
         $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000
@@ -230,7 +230,7 @@ start_server {} {
         $node_0 failover abort
         assert_match [s 0 master_failover_state] "no-failover"
 
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Just make sure everything is still synced
         wait_for_ofs_sync $node_0 $node_1
@@ -255,11 +255,11 @@ start_server {} {
 
         # We pause the target long enough to send a write command
         # during the pause. This write will not be interrupted.
-        exec kill -SIGSTOP [srv -1 pid]
+        pause_process [srv -1 pid]
         set rd [redis_deferring_client]
         $rd SET FOO BAR
         $node_0 failover to $node_1_host $node_1_port
-        exec kill -SIGCONT [srv -1 pid]
+        resume_process [srv -1 pid]
 
         # Wait for failover to end
         wait_for_condition 50 100 {
author	Oran Agra <oran@redislabs.com>	2023-04-12 09:19:21 +0300
committer	GitHub <noreply@github.com>	2023-04-12 09:19:21 +0300
commit	997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2 (patch)
tree	e9518b5a84c42b73ba39b735bc4d11010adc3d70 /tests/integration/failover.tcl
parent	45b8eea19f3e2491dec669f0745e513a4c9d7329 (diff)
download	redis-997fa41e99271cc5c3a79e9bf8a1332b3d9ab0c2.tar.gz