author    ny0312 <49037844+ny0312@users.noreply.github.com>  2022-01-23 17:28:32 -0800
committer GitHub <noreply@github.com>                        2022-01-23 17:28:32 -0800
commit    b40a9ba5fda9b28310964843831890144d8a77c2 (patch)
tree      28c88b0608a03bc08fa2ec04c965604a965c18db /tests/cluster
parent    7e5ded2ad0521600ceb57d71b0dbb19bbbc087b7 (diff)
download  redis-b40a9ba5fda9b28310964843831890144d8a77c2.tar.gz
Fix flaky cluster tests in 24-links.tcl (#10157)
* Fix flaky cluster test "Disconnect link when send buffer limit reached"
* Fix flaky cluster test "Each node has two links with each peer"

Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Diffstat (limited to 'tests/cluster')
-rw-r--r--  tests/cluster/tests/24-links.tcl | 61
1 file changed, 38 insertions(+), 23 deletions(-)
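Both fixes replace one-shot assertions with polling through the test suite's wait_for_condition
retry helper. For orientation, below is a minimal standalone Tcl sketch of that polling pattern.
It is a simplified stand-in for the suite's own helper, not its exact implementation (the real
helper also propagates errors raised while evaluating the condition), and the usage shown in the
trailing comment assumes the number_of_peers/number_of_links procs added by this commit plus the
suite's fail proc.

# Simplified sketch of a wait_for_condition-style retry helper.
# Retries the condition up to maxtries times, sleeping delay milliseconds
# between attempts; runs elsescript if the condition never becomes true.
proc wait_for_condition {maxtries delay e _else_ elsescript} {
    while {[incr maxtries -1] >= 0} {
        # Evaluate the condition in the caller's scope so it can reference
        # local variables such as $id.
        if {[uplevel 1 [list expr $e]]} {
            return
        }
        after $delay
    }
    # Condition never held: run the caller-supplied else branch.
    uplevel 1 $elsescript
}

# Usage mirroring the first fix: poll up to 50 times, 100 ms apart (~5 s),
# until every peer is served by exactly two links.
# wait_for_condition 50 100 {
#     [number_of_peers $id]*2 == [number_of_links $id]
# } else {
#     fail "Cluster links did not stabilize in time"
# }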
diff --git a/tests/cluster/tests/24-links.tcl b/tests/cluster/tests/24-links.tcl
index 6657a8ce4..92ce0f0be 100644
--- a/tests/cluster/tests/24-links.tcl
+++ b/tests/cluster/tests/24-links.tcl
@@ -8,18 +8,27 @@ test "Cluster should start ok" {
assert_cluster_state ok
}
+proc number_of_peers {id} {
+ expr [llength [get_cluster_nodes $id]] - 1
+}
+
+proc number_of_links {id} {
+ llength [get_cluster_links $id]
+}
+
test "Each node has two links with each peer" {
foreach_redis_id id {
- # Get number of peers, excluding myself
- set nodes [get_cluster_nodes $id]
- set num_peers [expr [llength $nodes] - 1]
+ # Assert that, from the point of view of each node, there are two links for
+ # each peer. It might take a while for the cluster to stabilize, so wait up
+ # to 5 seconds.
+ wait_for_condition 50 100 {
+ [number_of_peers $id]*2 == [number_of_links $id]
+ } else {
+ assert_equal [expr [number_of_peers $id]*2] [number_of_links $id]
+ }
- # Get number of links to peers
+ set nodes [get_cluster_nodes $id]
set links [get_cluster_links $id]
- set num_links [llength $links]
-
- # Two links per peer
- assert {$num_peers*2 eq $num_links}
# For each peer there should be exactly one
# link "to" it and one link "from" it.
@@ -59,9 +68,12 @@ test "Disconnect link when send buffer limit reached" {
set orig_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
set orig_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name]
- # On primary1, set cluster link send buffer limit to 32MB
+ # On primary1, set the cluster link send buffer limit to 256KB, which is large enough not to be
+ # overflowed by regular gossip messages, but small enough that it doesn't take too much memory
+ # to overflow it. If it is set too high, Redis may get OOM-killed by the kernel in some
+ # RAM-limited test environments before the limit is ever exceeded.
set oldlimit [lindex [$primary1 CONFIG get cluster-link-sendbuf-limit] 1]
- $primary1 CONFIG set cluster-link-sendbuf-limit [expr 32*1024*1024]
+ $primary1 CONFIG set cluster-link-sendbuf-limit [expr 256*1024]
assert {[get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 0}
# To manufacture an ever-growing send buffer from primary1 to primary2,
@@ -69,22 +81,25 @@ test "Disconnect link when send buffer limit reached" {
set primary2_pid [get_instance_attrib redis $primary2_id pid]
exec kill -SIGSTOP $primary2_pid
- # On primary1, send a 10MB Pubsub message. It will stay in send buffer of
- # the link from primary1 to primary2
- $primary1 publish channel [prepare_value [expr 10*1024*1024]]
-
- # Check the same link has not been disconnected, but its send buffer has grown
- set same_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
- assert {[dict get $same_link_p1_to_p2 create_time] eq [dict get $orig_link_p1_to_p2 create_time]}
- assert {[dict get $same_link_p1_to_p2 send_buffer_allocated] > [dict get $orig_link_p1_to_p2 send_buffer_allocated]}
-
- # On primary1, send another 30MB Pubsub message.
- $primary1 publish channel [prepare_value [expr 30*1024*1024]]
+ # On primary1, send 128KB Pubsub messages in a loop until the send buffer of the link from
+ # primary1 to primary2 exceeds the buffer limit and the link is therefore dropped.
+ # For the send buffer to grow, we first need to exhaust the TCP send buffer of primary1 and
+ # the TCP receive buffer of primary2. The sizes of these two buffers vary by OS, but 100
+ # 128KB messages should be sufficient.
+ set i 0
+ wait_for_condition 100 0 {
+ [catch {incr i} e] == 0 &&
+ [catch {$primary1 publish channel [prepare_value [expr 128*1024]]} e] == 0 &&
+ [catch {after 500} e] == 0 &&
+ [get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 1
+ } else {
+ fail "Cluster link not freed as expected"
+ }
+ puts -nonewline "$i 128KB messages needed to overflow 256KB buffer limit. "
- # Link has exceeded buffer limit and been dropped and recreated
+ # A new link to primary2 should have been recreated
set new_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
assert {[dict get $new_link_p1_to_p2 create_time] > [dict get $orig_link_p1_to_p2 create_time]}
- assert {[get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 1}
# Link from primary2 should not be affected
set same_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name]