summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMadelyn Olson <34459052+madolson@users.noreply.github.com>2022-10-02 23:25:16 -0700
committerGitHub <noreply@github.com>2022-10-03 09:25:16 +0300
commit663fbd34592aef3d24ac347ad00dbd02f3c7b713 (patch)
treea31b9cf7ec239be605fa94d0104e7f39f668d676
parenta549b78c48121c698c84536d25404a23f01c167d (diff)
downloadredis-663fbd34592aef3d24ac347ad00dbd02f3c7b713.tar.gz
Stabilize cluster hostnames tests (#11307)
This PR introduces a couple of changes to improve cluster test stability: 1. Increase the cluster node timeout to 3 seconds, which is similar to the normal cluster tests, but introduce a new mechanism (`cluster-ping-interval`) to shorten the ping period, i.e. ping more frequently than the default of half the node timeout, so that the tests are still fast. This new config is a hidden debug config. 2. Set `cluster-replica-no-failover yes` on a wider array of tests which are sensitive to failovers. Such unwanted failovers were occurring on the ARM CI.
-rw-r--r--src/cluster.c4
-rw-r--r--src/config.c1
-rw-r--r--src/server.h1
-rw-r--r--tests/support/cluster_helper.tcl2
-rw-r--r--tests/support/server.tcl2
-rw-r--r--tests/unit/cluster/hostnames.tcl13
6 files changed, 13 insertions, 10 deletions
diff --git a/src/cluster.c b/src/cluster.c
index c788194f1..e08ed6acb 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4198,9 +4198,11 @@ void clusterCron(void) {
* received PONG is older than half the cluster timeout, send
* a new ping now, to ensure all the nodes are pinged without
* a too big delay. */
+ mstime_t ping_interval = server.cluster_ping_interval ?
+ server.cluster_ping_interval : server.cluster_node_timeout/2;
if (node->link &&
node->ping_sent == 0 &&
- (now - node->pong_received) > server.cluster_node_timeout/2)
+ (now - node->pong_received) > ping_interval)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
diff --git a/src/config.c b/src/config.c
index 2a34adfd3..4149e06c3 100644
--- a/src/config.c
+++ b/src/config.c
@@ -3116,6 +3116,7 @@ standardConfig static_configs[] = {
/* Long Long configs */
createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL),/* milliseconds */
createLongLongConfig("cluster-node-timeout", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.cluster_node_timeout, 15000, INTEGER_CONFIG, NULL, NULL),
+ createLongLongConfig("cluster-ping-interval", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, LLONG_MAX, server.cluster_ping_interval, 0, INTEGER_CONFIG, NULL, NULL),
createLongLongConfig("slowlog-log-slower-than", NULL, MODIFIABLE_CONFIG, -1, LLONG_MAX, server.slowlog_log_slower_than, 10000, INTEGER_CONFIG, NULL, NULL),
createLongLongConfig("latency-monitor-threshold", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.latency_monitor_threshold, 0, INTEGER_CONFIG, NULL, NULL),
createLongLongConfig("proto-max-bulk-len", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, 1024*1024, LONG_MAX, server.proto_max_bulk_len, 512ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Bulk request max size */
diff --git a/src/server.h b/src/server.h
index 92acaa9a0..c88087164 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1852,6 +1852,7 @@ struct redisServer {
int cluster_enabled; /* Is cluster enabled? */
int cluster_port; /* Set the cluster port for a node. */
mstime_t cluster_node_timeout; /* Cluster node timeout. */
+ mstime_t cluster_ping_interval; /* A debug configuration for setting how often cluster nodes send ping messages. */
char *cluster_configfile; /* Cluster auto-generated config file name. */
struct clusterState *cluster; /* State of the cluster */
int cluster_migration_barrier; /* Cluster replicas migration barrier. */
diff --git a/tests/support/cluster_helper.tcl b/tests/support/cluster_helper.tcl
index 6d70e44c1..644eefdae 100644
--- a/tests/support/cluster_helper.tcl
+++ b/tests/support/cluster_helper.tcl
@@ -103,7 +103,7 @@ proc start_cluster {masters replicas options code {slot_allocator continuous_slo
# Configure the starting of multiple servers. Set cluster node timeout
# aggressively since many tests depend on ping/pong messages.
- set cluster_options [list overrides [list cluster-enabled yes cluster-node-timeout 500]]
+ set cluster_options [list overrides [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000]]
set options [concat $cluster_options $options]
# Cluster mode only supports a single database, so before executing the tests
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 6cc846b97..a23224bd7 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -418,7 +418,7 @@ proc start_server {options {code undefined}} {
set baseconfig $value
}
"overrides" {
- set overrides $value
+ set overrides [concat $overrides $value]
}
"config_lines" {
set config_lines $value
diff --git a/tests/unit/cluster/hostnames.tcl b/tests/unit/cluster/hostnames.tcl
index 02fb83615..031310172 100644
--- a/tests/unit/cluster/hostnames.tcl
+++ b/tests/unit/cluster/hostnames.tcl
@@ -42,8 +42,9 @@ proc get_slot_field {slot_output shard_id node_id attrib_id} {
return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id]
}
-# Start a cluster with 3 masters and 4 replicas.
-start_cluster 3 4 {tags {external:skip cluster}} {
+# Start a cluster with 3 masters and 4 replicas.
+# These tests rely on specific node ordering, so make sure no node fails over.
+start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
test "Set cluster hostnames and verify they are propagated" {
for {set j 0} {$j < [llength $::servers]} {incr j} {
R $j config set cluster-announce-hostname "host-$j.com"
@@ -202,7 +203,9 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1
R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1
- wait_for_condition 50 100 {
+ # This operation sometimes spikes to around 5 seconds to resolve the state,
+ # so it has a higher timeout.
+ wait_for_condition 50 500 {
[llength [R 6 CLUSTER SLOTS]] eq 3
} else {
fail "Node did not learn about the 2 shards it can talk to"
@@ -220,10 +223,6 @@ test "Test restart will keep hostname information" {
# Store the hostname in the config
R 0 config rewrite
- # If the primary is slow to reboot it might get demoted, so prevent the replica
- # from nominating itself.
- R 3 config set cluster-replica-no-failover yes
-
restart_server 0 true false
set slot_result [R 0 CLUSTER SLOTS]
assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "restart-1.com"