author     ny0312 <49037844+ny0312@users.noreply.github.com>   2021-12-16 21:56:59 -0800
committer  GitHub <noreply@github.com>                         2021-12-16 21:56:59 -0800
commit     792afb443211f190b3f8bea15e945661453fbddf (patch)
tree       ae3a50c8adc9ed0a3adb3d1093e7f0129b469a2b /tests/cluster
parent     687210f1550cf9048bed5f5539c9411fb22cd3b0 (diff)
Introduce memory management on cluster link buffers (#9774)
Introduce memory management on cluster link buffers:

* Introduce a new `cluster-link-sendbuf-limit` config that caps memory usage of cluster bus link send buffers.
* Introduce a new `CLUSTER LINKS` command that displays current TCP links to/from peers.
* Introduce a new `mem_cluster_links` field under `INFO` command output, which displays the overall memory usage by all current cluster links.
* Introduce a new `total_cluster_links_buffer_limit_exceeded` field under `CLUSTER INFO` command output, which displays the accumulated count of cluster links freed due to `cluster-link-sendbuf-limit`.
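A minimal sketch of how these new surfaces might be poked from the Tcl cluster test framework (the `R` and `get_info_field` helpers are the ones the tests below already use; the instance index 0 and the 64MB limit are arbitrary choices for illustration):

# Sketch only: exercise the new config, command and fields.
R 0 config set cluster-link-sendbuf-limit [expr 64*1024*1024]

# CLUSTER LINKS returns one entry per TCP link to/from a peer.
foreach link [R 0 cluster links] {
    puts "link: $link"
}

# Overall memory held by cluster link buffers (INFO), and the count of
# links freed because the send buffer limit was exceeded (CLUSTER INFO).
puts [get_info_field [R 0 info memory] mem_cluster_links]
puts [get_info_field [R 0 cluster info] total_cluster_links_buffer_limit_exceeded]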
Diffstat (limited to 'tests/cluster')
-rw-r--r--  tests/cluster/cluster.tcl         |  69
-rw-r--r--  tests/cluster/tests/24-links.tcl  |  99
2 files changed, 168 insertions, 0 deletions
diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl
index e95789282..7b7ce5343 100644
--- a/tests/cluster/cluster.tcl
+++ b/tests/cluster/cluster.tcl
@@ -175,3 +175,72 @@ proc wait_for_cluster_propagation {} {
fail "cluster config did not reach a consistent state"
}
}
+
+# Returns the parsed CLUSTER LINKS output of the instance identified
+# by the given `id` as a list of dictionaries, with each dictionary
+# corresponding to a link.
+proc get_cluster_links id {
+ set lines [R $id cluster links]
+ set links {}
+ foreach l $lines {
+ if {$l eq {}} continue
+ assert_equal [llength $l] 12
+ assert_equal [lindex $l 0] "direction"
+ set dir [lindex $l 1]
+ assert_equal [lindex $l 2] "node"
+ set node [lindex $l 3]
+ assert_equal [lindex $l 4] "create-time"
+ set create_time [lindex $l 5]
+ assert_equal [lindex $l 6] "events"
+ set events [lindex $l 7]
+ assert_equal [lindex $l 8] "send-buffer-allocated"
+ set send_buffer_allocated [lindex $l 9]
+ assert_equal [lindex $l 10] "send-buffer-used"
+ set send_buffer_used [lindex $l 11]
+ set link [dict create \
+ dir $dir \
+ node $node \
+ create_time $create_time \
+ events $events \
+ send_buffer_allocated $send_buffer_allocated \
+ send_buffer_used $send_buffer_used \
+ ]
+ lappend links $link
+ }
+ return $links
+}
+
+proc get_links_with_peer {this_instance_id peer_nodename} {
+ set links [get_cluster_links $this_instance_id]
+ set links_with_peer {}
+ foreach l $links {
+ if {[dict get $l node] eq $peer_nodename} {
+ lappend links_with_peer $l
+ }
+ }
+ return $links_with_peer
+}
+
+# Return the entry in the CLUSTER LINKS output of the instance identified by
+# `this_instance_id` that corresponds to the link established toward the peer
+# identified by `peer_nodename`.
+proc get_link_to_peer {this_instance_id peer_nodename} {
+ set links_with_peer [get_links_with_peer $this_instance_id $peer_nodename]
+ foreach l $links_with_peer {
+ if {[dict get $l dir] eq "to"} {
+ return $l
+ }
+ }
+ return {}
+}
+
+# Return the entry in the CLUSTER LINKS output of the instance identified by
+# `this_instance_id` that corresponds to the link accepted from the peer
+# identified by `peer_nodename`.
+proc get_link_from_peer {this_instance_id peer_nodename} {
+ set links_with_peer [get_links_with_peer $this_instance_id $peer_nodename]
+ foreach l $links_with_peer {
+ if {[dict get $l dir] eq "from"} {
+ return $l
+ }
+ }
+ return {}
+}
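A brief usage sketch for the helpers above (hedged: it assumes instances 0 and 1 are already running in a cluster under the test framework, and reuses the suite's existing `get_myself` helper, which the test below also relies on):

# Sketch: resolve the peer's node name, then fetch the outbound and
# inbound links that instance 0 holds with it.
set peer_name [dict get [get_myself 1] id]

set out_link [get_link_to_peer 0 $peer_name]     ;# link established toward the peer
set in_link  [get_link_from_peer 0 $peer_name]   ;# link accepted from the peer

# Each returned value is a dict exposing the six parsed fields.
puts "outbound send-buffer-used: [dict get $out_link send_buffer_used]"
puts "inbound  created at:       [dict get $in_link create_time]"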
diff --git a/tests/cluster/tests/24-links.tcl b/tests/cluster/tests/24-links.tcl
new file mode 100644
index 000000000..6657a8ce4
--- /dev/null
+++ b/tests/cluster/tests/24-links.tcl
@@ -0,0 +1,99 @@
+source "../tests/includes/init-tests.tcl"
+
+test "Create a cluster with two single-node shards" {
+ create_cluster 2 0
+}
+
+test "Cluster should start ok" {
+ assert_cluster_state ok
+}
+
+test "Each node has two links with each peer" {
+ foreach_redis_id id {
+ # Get number of peers, excluding myself
+ set nodes [get_cluster_nodes $id]
+ set num_peers [expr [llength $nodes] - 1]
+
+ # Get number of links to peers
+ set links [get_cluster_links $id]
+ set num_links [llength $links]
+
+ # Two links per peer
+ assert {$num_peers*2 eq $num_links}
+
+ # For each peer there should be exactly one
+ # link "to" it and one link "from" it.
+ foreach n $nodes {
+ if {[has_flag $n myself]} continue
+ set peer [dict get $n id]
+ set to 0
+ set from 0
+ foreach l $links {
+ if {[dict get $l node] eq $peer} {
+ if {[dict get $l dir] eq "to"} {
+ incr to
+ } elseif {[dict get $l dir] eq "from"} {
+ incr from
+ }
+ }
+ }
+ assert {$to eq 1}
+ assert {$from eq 1}
+ }
+ }
+}
+
+set primary1_id 0
+set primary2_id 1
+
+set primary1 [Rn $primary1_id]
+set primary2 [Rn $primary2_id]
+
+test "Disconnect link when send buffer limit reached" {
+ # On primary1, set timeout to 1 hour so links won't get disconnected due to timeouts
+ set oldtimeout [lindex [$primary1 CONFIG get cluster-node-timeout] 1]
+ $primary1 CONFIG set cluster-node-timeout [expr 60*60*1000]
+
+ # Get primary1's links with primary2
+ set primary2_name [dict get [get_myself $primary2_id] id]
+ set orig_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
+ set orig_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name]
+
+ # On primary1, set cluster link send buffer limit to 32MB
+ set oldlimit [lindex [$primary1 CONFIG get cluster-link-sendbuf-limit] 1]
+ $primary1 CONFIG set cluster-link-sendbuf-limit [expr 32*1024*1024]
+ assert {[get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 0}
+
+ # To manufacture an ever-growing send buffer from primary1 to primary2,
+ # make primary2 unresponsive.
+ set primary2_pid [get_instance_attrib redis $primary2_id pid]
+ exec kill -SIGSTOP $primary2_pid
+
+# On primary1, send a 10MB Pubsub message. It will stay in the send buffer
+# of the link from primary1 to primary2.
+ $primary1 publish channel [prepare_value [expr 10*1024*1024]]
+
+ # Check the same link has not been disconnected, but its send buffer has grown
+ set same_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
+ assert {[dict get $same_link_p1_to_p2 create_time] eq [dict get $orig_link_p1_to_p2 create_time]}
+ assert {[dict get $same_link_p1_to_p2 send_buffer_allocated] > [dict get $orig_link_p1_to_p2 send_buffer_allocated]}
+
+ # On primary1, send another 30MB Pubsub message.
+ $primary1 publish channel [prepare_value [expr 30*1024*1024]]
+
+ # Link has exceeded buffer limit and been dropped and recreated
+ set new_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name]
+ assert {[dict get $new_link_p1_to_p2 create_time] > [dict get $orig_link_p1_to_p2 create_time]}
+ assert {[get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 1}
+
+ # Link from primary2 should not be affected
+ set same_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name]
+ assert {[dict get $same_link_p1_from_p2 create_time] eq [dict get $orig_link_p1_from_p2 create_time]}
+
+ # Revive primary2
+ exec kill -SIGCONT $primary2_pid
+
+ # Reset configs on primary1 so config changes don't leak out to other tests
+ $primary1 CONFIG set cluster-node-timeout $oldtimeout
+ $primary1 CONFIG set cluster-link-sendbuf-limit $oldlimit
+}
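A related check one could bolt onto the test above (a sketch only, not part of this commit): while primary2 is still SIGSTOPped and the 10MB message sits in the send buffer, the new `mem_cluster_links` field reported by primary1 should account for at least that payload. It assumes the snippet is placed right after the first publish, and that `get_info_field` parses the raw INFO output the same way it parses CLUSTER INFO above.

# Sketch only, not part of this commit: assert that INFO's
# mem_cluster_links reflects the buffered 10MB payload.
set mem_links [get_info_field [$primary1 info memory] mem_cluster_links]
assert {$mem_links >= [expr 10*1024*1024]}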