From 0232eaacdfc568ea1970fe650994e35da9622f77 Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 22 Jan 2015 18:57:45 +0100
Subject: Avoid duplicated instance execution code in Cluster test.

---
 tests/instances.tcl | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/tests/instances.tcl b/tests/instances.tcl
index 7d87cdf59..353d9b2d2 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -33,6 +33,25 @@ if {[catch {cd tmp}]} {
     exit 1
 }
 
+# Execute the specified instance of the server specified by 'type', using
+# the provided configuration file. Returns the PID of the process.
+proc exec_instance {type cfgfile} {
+    if {$type eq "redis"} {
+        set prgname redis-server
+    } elseif {$type eq "sentinel"} {
+        set prgname redis-sentinel
+    } else {
+        error "Unknown instance type."
+    }
+
+    if {$::valgrind} {
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+    } else {
+        set pid [exec ../../../src/${prgname} $cfgfile &]
+    }
+    return $pid
+}
+
 # Spawn a redis or sentinel instance, depending on 'type'.
 proc spawn_instance {type base_port count {conf {}}} {
     for {set j 0} {$j < $count} {incr j} {
@@ -59,20 +78,7 @@ proc spawn_instance {type base_port count {conf {}}} {
         close $cfg
 
         # Finally exec it and remember the pid for later cleanup.
-        if {$type eq "redis"} {
-            set prgname redis-server
-        } elseif {$type eq "sentinel"} {
-            set prgname redis-sentinel
-        } else {
-            error "Unknown instance type."
-        }
-
-        if {$::valgrind} {
-            set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
-        } else {
-            set pid [exec ../../../src/${prgname} $cfgfile &]
-        }
-
+        set pid [exec_instance $type $cfgfile]
         lappend ::pids $pid
 
         # Check availability
@@ -411,18 +417,7 @@ proc restart_instance {type id} {
 
     # Execute the instance with its old setup and append the new pid
     # file for cleanup.
-    if {$type eq "redis"} {
-        set prgname redis-server
-    } else {
-        set prgname redis-sentinel
-    }
-
-    if {$::valgrind} {
-        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
-    } else {
-        set pid [exec ../../../src/${prgname} $cfgfile &]
-    }
-
+    set pid [exec_instance $type $cfgfile]
     set_instance_attrib $type $id pid $pid
     lappend ::pids $pid
--
cgit v1.2.1

From 75c5229a8b0416c0f6a852f172727de91b0d0eee Mon Sep 17 00:00:00 2001
From: antirez
Date: Wed, 28 Jan 2015 23:26:42 +0100
Subject: create-cluster script added.

Simple shell script to create / destroy Redis clusters for manual testing.
---
 utils/create-cluster/README         | 27 +++++++++++++++
 utils/create-cluster/create-cluster | 66 +++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 utils/create-cluster/README
 create mode 100755 utils/create-cluster/create-cluster

diff --git a/utils/create-cluster/README b/utils/create-cluster/README
new file mode 100644
index 000000000..f3a3f0883
--- /dev/null
+++ b/utils/create-cluster/README
@@ -0,0 +1,27 @@
+Create-cluster is a small script used to easily start a big number of Redis
+instances configured to run in cluster mode. Its main goal is to allow manual
+testing in conditions which are not easy to replicate with the Redis cluster
+unit tests, for example when a lot of instances are needed in order to trigger
+a given bug.
+
+The tool can also be used just to easily create a number of instances in a
+Redis Cluster in order to experiment a bit with the system.
+
+USAGE
+---
+
+To create a cluster, follow these steps:
+
+1. Edit create-cluster and change the start / end port, depending on the
+number of instances you want to create.
+2. Use "./create-cluster start" in order to run the instances.
+3. Use "./create-cluster create" in order to execute redis-trib create, so that
+an actual Redis cluster will be created.
+4. Now you are ready to play with the cluster. AOF files and logs for each instance are created in the current directory.
+
+In order to stop a cluster:
+
+1. Use "./create-cluster stop" to stop all the instances. After you stopped the instances you can use "./create-cluster start" to restart them if you change your mind.
+2. Use "./create-cluster clean" to remove all the AOF / log files to restart with a clean environment.
+
+It is currently hardcoded that you start a cluster where each master has one slave, since the script is pretty basic.
diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
new file mode 100755
index 000000000..80161587e
--- /dev/null
+++ b/utils/create-cluster/create-cluster
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+PORT=30000
+ENDPORT=30006
+
+if [ "$1" == "start" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Starting $PORT"
+        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout 5 --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
+    done
+    exit 0
+fi
+
+if [ "$1" == "create" ]
+then
+    HOSTS=""
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        HOSTS="$HOSTS 127.0.0.1:$PORT"
+    done
+    ../../src/redis-trib.rb create --replicas 1 $HOSTS
+    exit 0
+fi
+
+if [ "$1" == "stop" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Stopping $PORT"
+        redis-cli -p $PORT shutdown nosave
+    done
+    exit 0
+fi
+
+if [ "$1" == "join" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Joining $PORT"
+        redis-cli -p $PORT CLUSTER MEET 127.0.0.1 10002
+    done
+
+    echo "Waiting 5 seconds"
+    sleep 5
+
+    PORT=30000
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Replicate $PORT"
+        redis-cli -p $PORT CLUSTER REPLICATE $2
+    done
+    exit 0
+fi
+
+if [ "$1" == "clean" ]
+then
+    rm -rf *.log
+    rm -rf appendonly*.aof
+    rm -rf dump*.rdb
+    rm -rf nodes*.conf
+    exit 0
+fi
+
+echo "Usage: $0 [start|create|stop|join|clean]"
--
cgit v1.2.1
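As an illustrative session (not part of the patch itself; it assumes the default PORT/ENDPORT range above, so instances on ports 30001-30006, and a redis-trib.rb reachable as ../../src/redis-trib.rb), a manual test run might look like this:

    cd utils/create-cluster
    ./create-cluster start        # launch the instances in cluster mode
    ./create-cluster create       # run redis-trib create with --replicas 1
    redis-cli -p 30001 cluster nodes   # inspect the resulting layout
    ./create-cluster stop
    ./create-cluster clean        # remove AOF, RDB, log and nodes files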
From 5031c2394ed55ec9fbe6152f029487526a12fb43 Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 13:21:42 +0100
Subject: create-cluster script: sane default timeout.

---
 utils/create-cluster/create-cluster | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
index 80161587e..76f61091d 100755
--- a/utils/create-cluster/create-cluster
+++ b/utils/create-cluster/create-cluster
@@ -2,13 +2,14 @@
 
 PORT=30000
 ENDPORT=30006
+TIMEOUT=15000
 
 if [ "$1" == "start" ]
 then
     while [ $((PORT < ENDPORT)) != "0" ]; do
         PORT=$((PORT+1))
         echo "Starting $PORT"
-        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout 5 --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
+        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
     done
     exit 0
 fi
--
cgit v1.2.1

From 8dd3263216ef2553fc22886dfd38f8157a0516ff Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 14:17:45 +0100
Subject: Cluster: use a number of gossip sections proportional to cluster size.

Otherwise it is impossible to receive the majority of failure reports in
the node_timeout*2 window in larger clusters. Still, with a 200 node
cluster, 20 gossip sections are a very reasonable number of bytes to send.

A side effect of this change is also faster cluster node joins for large
clusters, because the cluster layout takes less time to propagate.
---
 src/cluster.c | 59 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index ce544970b..469016c33 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2037,7 +2037,8 @@ void clusterBroadcastMessage(void *buf, size_t len) {
     dictReleaseIterator(di);
 }
 
-/* Build the message header */
+/* Build the message header. hdr must point to a buffer at least
+ * sizeof(clusterMsg) in bytes. */
 void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
     int totlen = 0;
     uint64_t offset;
@@ -2098,40 +2099,60 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
 
 /* Send a PING or PONG packet to the specified node, making sure to add enough
  * gossip informations. */
 void clusterSendPing(clusterLink *link, int type) {
-    unsigned char buf[sizeof(clusterMsg)+sizeof(clusterMsgDataGossip)*3];
-    clusterMsg *hdr = (clusterMsg*) buf;
-    int gossipcount = 0, totlen;
-    /* freshnodes is the number of nodes we can still use to populate the
-     * gossip section of the ping packet. Basically we start with the nodes
-     * we have in memory minus two (ourself and the node we are sending the
-     * message to). Every time we add a node we decrement the counter, so when
-     * it will drop to <= zero we know there is no more gossip info we can
-     * send. */
+    unsigned char *buf;
+    clusterMsg *hdr;
+    int gossipcount = 0; /* Number of gossip sections added so far. */
+    int wanted; /* Number of gossip sections we want to append if possible. */
+    int totlen; /* Total packet length. */
+    /* freshnodes is the max number of nodes we can hope to append at all:
+     * nodes available minus two (ourself and the node we are sending the
+     * message to). However practically there may be fewer valid nodes since
+     * nodes in handshake state, disconnected, are not considered. */
    int freshnodes = dictSize(server.cluster->nodes)-2;
 
+    /* How many gossip sections we want to add? 1/10 of the available nodes
+     * and anyway at least 3. */
+    wanted = freshnodes/10;
+    if (wanted < 3) wanted = 3;
+
+    /* Compute the maximum totlen to allocate our buffer. We'll fix the totlen
+     * later according to the number of gossip sections we really were able
+     * to put inside the packet. */
+    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
+    totlen += (sizeof(clusterMsgDataGossip)*wanted);
+    /* Note: clusterBuildMessageHdr() expects the buffer to be always at least
+     * sizeof(clusterMsg) or more. */
+    if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg);
+    buf = zcalloc(totlen);
+    hdr = (clusterMsg*) buf;
+
+    /* Populate the header. */
     if (link->node && type == CLUSTERMSG_TYPE_PING)
         link->node->ping_sent = mstime();
     clusterBuildMessageHdr(hdr,type);
 
     /* Populate the gossip fields */
-    while(freshnodes > 0 && gossipcount < 3) {
+    int maxiterations = wanted+10;
+    while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
         dictEntry *de = dictGetRandomKey(server.cluster->nodes);
         clusterNode *this = dictGetVal(de);
         clusterMsgDataGossip *gossip;
         int j;
 
+        /* Don't include this node: the whole packet header is about us
+         * already, so we just gossip about other nodes. */
+        if (this == myself) continue;
+
         /* In the gossip section don't include:
-         * 1) Myself.
-         * 2) Nodes in HANDSHAKE state.
+         * 1) Nodes in HANDSHAKE state.
          * 3) Nodes with the NOADDR flag set.
         * 4) Disconnected nodes if they don't have configured slots. */
-        if (this == myself ||
-            this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
+        if (this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
             (this->link == NULL && this->numslots == 0))
         {
-            freshnodes--; /* otherwise we may loop forever. */
-            continue;
+            freshnodes--; /* Technically not correct, but saves CPU. */
+            continue;
         }
 
         /* Check if we already added this node */
@@ -2152,11 +2173,15 @@ void clusterSendPing(clusterLink *link, int type) {
         gossip->flags = htons(this->flags);
         gossipcount++;
     }
+
+    /* Ready to send... fix the totlen field and queue the message in the
+     * output buffer. */
     totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
     totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
     hdr->count = htons(gossipcount);
     hdr->totlen = htonl(totlen);
     clusterSendMessage(link,buf,totlen);
+    zfree(buf);
 }
 
 /* Send a PONG packet to every connected node that's not in handshake state
--
cgit v1.2.1

From 92f29b8904faaa0e554eabdbd7d7928fee95c11f Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 15:01:26 +0100
Subject: CLUSTER count-failure-reports command added.

---
 src/cluster.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/cluster.c b/src/cluster.c
index 469016c33..17a5525b7 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4064,6 +4064,18 @@ void clusterCommand(redisClient *c) {
             addReplyBulkCString(c,ni);
             sdsfree(ni);
         }
+    } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") &&
+               c->argc == 3)
+    {
+        /* CLUSTER COUNT-FAILURE-REPORTS <node-id> */
+        clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
+
+        if (!n) {
+            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
+            return;
+        } else {
+            addReplyLongLong(c,clusterNodeFailureReportsCount(n));
+        }
     } else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
                (c->argc == 2 || c->argc == 3))
     {
--
cgit v1.2.1
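For illustration only (this example is not part of the patch; the node ID shown is hypothetical and the port assumes the create-cluster setup from the earlier patches), the new subcommand can be queried with redis-cli while some peers are flagging a node as PFAIL:

    redis-cli -p 30001 cluster count-failure-reports 3e3a6cb0d9a9a87168e266b0a0b24026c0aae3f0
    (integer) 2

The reply is the number of currently valid failure reports for that node, as returned by clusterNodeFailureReportsCount().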
From 2616d6f6dc6f5ccaae8069bf491fc00c6c823a4c Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 15:40:08 +0100
Subject: Cluster: magical 10% of nodes explained in comments.

---
 src/cluster.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 17a5525b7..995f16633 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2110,8 +2110,32 @@ void clusterSendPing(clusterLink *link, int type) {
      * nodes in handshake state, disconnected, are not considered. */
     int freshnodes = dictSize(server.cluster->nodes)-2;
 
-    /* How many gossip sections we want to add? 1/10 of the available nodes
-     * and anyway at least 3. */
+    /* How many gossip sections we want to add? 1/10 of the number of nodes
+     * and anyway at least 3. Why 1/10?
+     *
+     * If we have N masters, with N/10 entries, and we consider that in
+     * node_timeout we exchange with each other node at least 4 packets
+     * (we ping in the worst case in node_timeout/2 time, and we also
+     * receive two pings from the host), we have a total of 8 packets
+     * in the node_timeout*2 failure reports validity time. So we have
+     * that, for a single PFAIL node, we can expect to receive the following
+     * number of failure reports (in the specified window of time):
+     *
+     * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS:
+     *
+     * PROB = probability of being featured in a single gossip entry,
+     *        which is 1 / NUM_OF_NODES.
+     * ENTRIES = 10.
+     * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS.
+     *
+     * If we assume we have just masters (so num of nodes and num of masters
+     * is the same), with 1/10 we always get over the majority, and specifically
+     * 80% of the number of nodes, to account for many masters failing at the
+     * same time.
+     *
+     * Since we have non-voting slaves that lower the probability of an entry
+     * to feature our node, we set the number of entries per packet as
+     * 10% of the total nodes we have. */
     wanted = freshnodes/10;
     if (wanted < 3) wanted = 3;
 
--
cgit v1.2.1
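To make the comment's arithmetic concrete, here is a quick back-of-the-envelope check (an illustration, not part of the patch) for a hypothetical cluster of 100 masters, where each PING/PONG carries about 100/10 = 10 gossip entries:

    # Expected failure reports for one PFAIL node in the node_timeout*2 window:
    # PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS = (1/100) * 10 * (2*4*100)
    echo $(( 10 * 2 * 4 * 100 / 100 ))    # prints 80, i.e. about 80% of the masters

which is comfortably above the 51 reports needed for a majority in a cluster of that size.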