Stabilize cluster hostnames tests (#11307)

This PR introduces two changes to improve cluster test stability: 1. Increase the cluster node timeout used by the tests to 3 seconds, which is similar to the normal cluster tests, and introduce a new config, `cluster-ping-interval`, that lets nodes send pings more frequently than the default of half the node timeout, so that the tests are still fast. This new config is intended for debugging only. 2. Set `cluster-replica-no-failover yes` on a wider array of tests that are sensitive to failovers. Such failovers were occurring on the ARM CI.
author: Madelyn Olson <34459052+madolson@users.noreply.github.com> 2022-10-02 23:25:16 -0700
committer: GitHub <noreply@github.com> 2022-10-03 09:25:16 +0300
commit: 663fbd34592aef3d24ac347ad00dbd02f3c7b713 (patch)
tree: a31b9cf7ec239be605fa94d0104e7f39f668d676
parent: a549b78c48121c698c84536d25404a23f01c167d (diff)
download: redis-663fbd34592aef3d24ac347ad00dbd02f3c7b713.tar.gz
6 files changed, 13 insertions, 10 deletions
diff --git a/src/cluster.c b/src/cluster.c
index c788194f1..e08ed6acb 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4198,9 +4198,11 @@ void clusterCron(void) {
          * received PONG is older than half the cluster timeout, send
          * a new ping now, to ensure all the nodes are pinged without
          * a too big delay. */
+        mstime_t ping_interval = server.cluster_ping_interval ? 
+            server.cluster_ping_interval : server.cluster_node_timeout/2;
         if (node->link &&
             node->ping_sent == 0 &&
-            (now - node->pong_received) > server.cluster_node_timeout/2)
+            (now - node->pong_received) > ping_interval)
         {
             clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
             continue;
diff --git a/src/config.c b/src/config.c
index 2a34adfd3..4149e06c3 100644
--- a/src/config.c
+++ b/src/config.c
@@ -3116,6 +3116,7 @@ standardConfig static_configs[] = {
     /* Long Long configs */
     createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL),/* milliseconds */
     createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL),
+    createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL),
     createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL),
     createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL),
     createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024*1024, LONG_MAX, server.proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */
diff --git a/src/server.h b/src/server.h
index 92acaa9a0..c88087164 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1852,6 +1852,7 @@ struct redisServer {
     int cluster_enabled;      /* Is cluster enabled? */
     int cluster_port;         /* Set the cluster port for a node. */
     mstime_t cluster_node_timeout; /* Cluster node timeout. */
+    mstime_t cluster_ping_interval;    /* A debug configuration for setting how often cluster nodes send ping messages. */
     char *cluster_configfile; /* Cluster auto-generated config file name. */
     struct clusterState *cluster;  /* State of the cluster */
     int cluster_migration_barrier; /* Cluster replicas migration barrier. */
diff --git a/tests/support/cluster_helper.tcl b/tests/support/cluster_helper.tcl
index 6d70e44c1..644eefdae 100644
--- a/tests/support/cluster_helper.tcl
+++ b/tests/support/cluster_helper.tcl
@@ -103,7 +103,7 @@ proc start_cluster {masters replicas options code {slot_allocator continuous_slo
 
     # Configure the starting of multiple servers. Set cluster node timeout
     # aggressively since many tests depend on ping/pong messages. 
-    set cluster_options [list overrides [list cluster-enabled yes cluster-node-timeout 500]]
+    set cluster_options [list overrides [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000]]
     set options [concat $cluster_options $options]
 
     # Cluster mode only supports a single database, so before executing the tests
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 6cc846b97..a23224bd7 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -418,7 +418,7 @@ proc start_server {options {code undefined}} {
                 set baseconfig $value
             }
             "overrides" {
-                set overrides $value
+                set overrides [concat $overrides $value]
             }
             "config_lines" {
                 set config_lines $value
diff --git a/tests/unit/cluster/hostnames.tcl b/tests/unit/cluster/hostnames.tcl
index 02fb83615..031310172 100644
--- a/tests/unit/cluster/hostnames.tcl
+++ b/tests/unit/cluster/hostnames.tcl
@@ -42,8 +42,9 @@ proc get_slot_field {slot_output shard_id node_id attrib_id} {
     return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id]
 }
 
-# Start a cluster with 3 masters and 4 replicas. 
-start_cluster 3 4 {tags {external:skip cluster}} {
+# Start a cluster with 3 masters and 4 replicas.
+# These tests rely on specific node ordering, so make sure no node fails over.
+start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
 test "Set cluster hostnames and verify they are propagated" {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
         R $j config set cluster-announce-hostname "host-$j.com"
@@ -202,7 +203,9 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
     R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1
     R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1
 
-    wait_for_condition 50 100 {
+    # This operation sometimes spikes to around 5 seconds to resolve the state,
+    # so it has a higher timeout. 
+    wait_for_condition 50 500 {
         [llength [R 6 CLUSTER SLOTS]] eq 3
     } else {
         fail "Node did not learn about the 2 shards it can talk to"
@@ -220,10 +223,6 @@ test "Test restart will keep hostname information" {
     # Store the hostname in the config
     R 0 config rewrite
 
-    # If the primary is slow to reboot it might get demoted, so prevent the replica
-    # from nominating itself.
-    R 3 config set cluster-replica-no-failover yes
-
     restart_server 0 true false
     set slot_result [R 0 CLUSTER SLOTS]
     assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "restart-1.com"
author	Madelyn Olson <34459052+madolson@users.noreply.github.com>	2022-10-02 23:25:16 -0700
committer	GitHub <noreply@github.com>	2022-10-03 09:25:16 +0300
commit	663fbd34592aef3d24ac347ad00dbd02f3c7b713 (patch)
tree	a31b9cf7ec239be605fa94d0104e7f39f668d676
parent	a549b78c48121c698c84536d25404a23f01c167d (diff)
download	redis-663fbd34592aef3d24ac347ad00dbd02f3c7b713.tar.gz