From 0232eaacdfc568ea1970fe650994e35da9622f77 Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 22 Jan 2015 18:57:45 +0100
Subject: Avoid duplicated instance execution code in Cluster test.

---
 tests/instances.tcl | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/tests/instances.tcl b/tests/instances.tcl
index 7d87cdf59..353d9b2d2 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -33,6 +33,25 @@ if {[catch {cd tmp}]} {
     exit 1
 }
 
+# Execute the specified instance of the server specified by 'type', using
+# the provided configuration file. Returns the PID of the process.
+proc exec_instance {type cfgfile} {
+    if {$type eq "redis"} {
+        set prgname redis-server
+    } elseif {$type eq "sentinel"} {
+        set prgname redis-sentinel
+    } else {
+        error "Unknown instance type."
+    }
+
+    if {$::valgrind} {
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+    } else {
+        set pid [exec ../../../src/${prgname} $cfgfile &]
+    }
+    return $pid
+}
+
 # Spawn a redis or sentinel instance, depending on 'type'.
 proc spawn_instance {type base_port count {conf {}}} {
     for {set j 0} {$j < $count} {incr j} {
@@ -59,20 +78,7 @@ proc spawn_instance {type base_port count {conf {}}} {
         close $cfg
 
         # Finally exec it and remember the pid for later cleanup.
-        if {$type eq "redis"} {
-            set prgname redis-server
-        } elseif {$type eq "sentinel"} {
-            set prgname redis-sentinel
-        } else {
-            error "Unknown instance type."
-        }
-
-        if {$::valgrind} {
-            set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
-        } else {
-            set pid [exec ../../../src/${prgname} $cfgfile &]
-        }
-
+        set pid [exec_instance $type $cfgfile]
         lappend ::pids $pid
 
         # Check availability
@@ -411,18 +417,7 @@ proc restart_instance {type id} {
 
     # Execute the instance with its old setup and append the new pid
     # file for cleanup.
-    if {$type eq "redis"} {
-        set prgname redis-server
-    } else {
-        set prgname redis-sentinel
-    }
-
-    if {$::valgrind} {
-        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
-    } else {
-        set pid [exec ../../../src/${prgname} $cfgfile &]
-    }
-
+    set pid [exec_instance $type $cfgfile]
     set_instance_attrib $type $id pid $pid
     lappend ::pids $pid
--
cgit v1.2.1

From 75c5229a8b0416c0f6a852f172727de91b0d0eee Mon Sep 17 00:00:00 2001
From: antirez
Date: Wed, 28 Jan 2015 23:26:42 +0100
Subject: create-cluster script added.

Simple shell script to create / destroy Redis clusters for manual testing.
---
 utils/create-cluster/README         | 27 +++++++++++++++
 utils/create-cluster/create-cluster | 66 +++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 utils/create-cluster/README
 create mode 100755 utils/create-cluster/create-cluster

diff --git a/utils/create-cluster/README b/utils/create-cluster/README
new file mode 100644
index 000000000..f3a3f0883
--- /dev/null
+++ b/utils/create-cluster/README
@@ -0,0 +1,27 @@
+Create-cluster is a small script used to easily start a big number of Redis
+instances configured to run in cluster mode. Its main goal is to allow manual
+testing in conditions which are not easy to replicate with the Redis cluster
+unit tests, for example when a lot of instances are needed in order to trigger
+a given bug.
+
+The tool can also be used just to easily create a number of instances in a
+Redis Cluster in order to experiment a bit with the system.
+
+USAGE
+---
+
+To create a cluster, follow these steps:
+
+1. Edit create-cluster and change the start / end port, depending on the
+number of instances you want to create.
+2. Use "./create-cluster start" in order to run the instances.
+3. Use "./create-cluster create" in order to execute redis-trib create, so that
+an actual Redis cluster will be created.
+4. Now you are ready to play with the cluster. AOF files and logs for each instance are created in the current directory.
+
+In order to stop a cluster:
+
+1. Use "./create-cluster stop" to stop all the instances. After you stopped the instances you can use "./create-cluster start" to restart them if you change your mind.
+2. Use "./create-cluster clean" to remove all the AOF / log files to restart with a clean environment.
+
+It is currently hardcoded that you start a cluster where each master has one slave, since the script is pretty basic.
diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
new file mode 100755
index 000000000..80161587e
--- /dev/null
+++ b/utils/create-cluster/create-cluster
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+PORT=30000
+ENDPORT=30006
+
+if [ "$1" == "start" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Starting $PORT"
+        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout 5 --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
+    done
+    exit 0
+fi
+
+if [ "$1" == "create" ]
+then
+    HOSTS=""
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        HOSTS="$HOSTS 127.0.0.1:$PORT"
+    done
+    ../../src/redis-trib.rb create --replicas 1 $HOSTS
+    exit 0
+fi
+
+if [ "$1" == "stop" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Stopping $PORT"
+        redis-cli -p $PORT shutdown nosave
+    done
+    exit 0
+fi
+
+if [ "$1" == "join" ]
+then
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Joining $PORT"
+        redis-cli -p $PORT CLUSTER MEET 127.0.0.1 10002
+    done
+
+    echo "Waiting 5 seconds"
+    sleep 5
+
+    PORT=30000
+    while [ $((PORT < ENDPORT)) != "0" ]; do
+        PORT=$((PORT+1))
+        echo "Replicate $PORT"
+        redis-cli -p $PORT CLUSTER REPLICATE $2
+    done
+    exit 0
+fi
+
+if [ "$1" == "clean" ]
+then
+    rm -rf *.log
+    rm -rf appendonly*.aof
+    rm -rf dump*.rdb
+    rm -rf nodes*.conf
+    exit 0
+fi
+
+echo "Usage: $0 [start|create|stop|join|clean]"
--
cgit v1.2.1
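As an illustrative session (not part of the patch itself; it assumes the default PORT/ENDPORT range above, so instances on ports 30001-30006, and a redis-trib.rb reachable as ../../src/redis-trib.rb), a manual test run might look like this:

    cd utils/create-cluster
    ./create-cluster start        # launch the instances in cluster mode
    ./create-cluster create       # run redis-trib create with --replicas 1
    redis-cli -p 30001 cluster nodes   # inspect the resulting layout
    ./create-cluster stop
    ./create-cluster clean        # remove AOF, RDB, log and nodes files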
From 5031c2394ed55ec9fbe6152f029487526a12fb43 Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 13:21:42 +0100
Subject: create-cluster script: sane default timeout.

---
 utils/create-cluster/create-cluster | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
index 80161587e..76f61091d 100755
--- a/utils/create-cluster/create-cluster
+++ b/utils/create-cluster/create-cluster
@@ -2,13 +2,14 @@
 
 PORT=30000
 ENDPORT=30006
+TIMEOUT=15000
 
 if [ "$1" == "start" ]
 then
     while [ $((PORT < ENDPORT)) != "0" ]; do
         PORT=$((PORT+1))
         echo "Starting $PORT"
-        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout 5 --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
+        ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
     done
     exit 0
 fi
--
cgit v1.2.1

From 8dd3263216ef2553fc22886dfd38f8157a0516ff Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 14:17:45 +0100
Subject: Cluster: use a number of gossip sections proportional to cluster size.

Otherwise it is impossible to receive the majority of failure reports in
the node_timeout*2 window in larger clusters. Still, with a 200 node
cluster, 20 gossip sections are a very reasonable number of bytes to send.

A side effect of this change is also faster cluster node joins for large
clusters, because the cluster layout takes less time to propagate.
---
 src/cluster.c | 59 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index ce544970b..469016c33 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2037,7 +2037,8 @@ void clusterBroadcastMessage(void *buf, size_t len) {
     dictReleaseIterator(di);
 }
 
-/* Build the message header */
+/* Build the message header. hdr must point to a buffer at least
+ * sizeof(clusterMsg) in bytes. */
 void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
     int totlen = 0;
     uint64_t offset;
@@ -2098,40 +2099,60 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
 
 /* Send a PING or PONG packet to the specified node, making sure to add enough
  * gossip informations. */
 void clusterSendPing(clusterLink *link, int type) {
-    unsigned char buf[sizeof(clusterMsg)+sizeof(clusterMsgDataGossip)*3];
-    clusterMsg *hdr = (clusterMsg*) buf;
-    int gossipcount = 0, totlen;
-    /* freshnodes is the number of nodes we can still use to populate the
-     * gossip section of the ping packet. Basically we start with the nodes
-     * we have in memory minus two (ourself and the node we are sending the
-     * message to). Every time we add a node we decrement the counter, so when
-     * it will drop to <= zero we know there is no more gossip info we can
-     * send. */
+    unsigned char *buf;
+    clusterMsg *hdr;
+    int gossipcount = 0; /* Number of gossip sections added so far. */
+    int wanted; /* Number of gossip sections we want to append if possible. */
+    int totlen; /* Total packet length. */
+    /* freshnodes is the max number of nodes we can hope to append at all:
+     * nodes available minus two (ourself and the node we are sending the
+     * message to). However practically there may be fewer valid nodes since
+     * nodes in handshake state, disconnected, are not considered. */
    int freshnodes = dictSize(server.cluster->nodes)-2;
 
+    /* How many gossip sections we want to add? 1/10 of the available nodes
+     * and anyway at least 3. */
+    wanted = freshnodes/10;
+    if (wanted < 3) wanted = 3;
+
+    /* Compute the maximum totlen to allocate our buffer. We'll fix the totlen
+     * later according to the number of gossip sections we really were able
+     * to put inside the packet. */
+    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
+    totlen += (sizeof(clusterMsgDataGossip)*wanted);
+    /* Note: clusterBuildMessageHdr() expects the buffer to be always at least
+     * sizeof(clusterMsg) or more. */
+    if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg);
+    buf = zcalloc(totlen);
+    hdr = (clusterMsg*) buf;
+
+    /* Populate the header. */
     if (link->node && type == CLUSTERMSG_TYPE_PING)
         link->node->ping_sent = mstime();
     clusterBuildMessageHdr(hdr,type);
 
     /* Populate the gossip fields */
-    while(freshnodes > 0 && gossipcount < 3) {
+    int maxiterations = wanted+10;
+    while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
         dictEntry *de = dictGetRandomKey(server.cluster->nodes);
         clusterNode *this = dictGetVal(de);
         clusterMsgDataGossip *gossip;
         int j;
 
+        /* Don't include this node: the whole packet header is about us
+         * already, so we just gossip about other nodes. */
+        if (this == myself) continue;
+
         /* In the gossip section don't include:
-         * 1) Myself.
-         * 2) Nodes in HANDSHAKE state.
+         * 1) Nodes in HANDSHAKE state.
          * 3) Nodes with the NOADDR flag set.
         * 4) Disconnected nodes if they don't have configured slots. */
-        if (this == myself ||
-            this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
+        if (this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
             (this->link == NULL && this->numslots == 0))
         {
-            freshnodes--; /* otherwise we may loop forever. */
-            continue;
+            freshnodes--; /* Technically not correct, but saves CPU. */
+            continue;
         }
 
         /* Check if we already added this node */
@@ -2152,11 +2173,15 @@ void clusterSendPing(clusterLink *link, int type) {
         gossip->flags = htons(this->flags);
         gossipcount++;
     }
+
+    /* Ready to send... fix the totlen field and queue the message in the
+     * output buffer. */
     totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
     totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
     hdr->count = htons(gossipcount);
     hdr->totlen = htonl(totlen);
     clusterSendMessage(link,buf,totlen);
+    zfree(buf);
 }
 
 /* Send a PONG packet to every connected node that's not in handshake state
--
cgit v1.2.1

From 92f29b8904faaa0e554eabdbd7d7928fee95c11f Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 15:01:26 +0100
Subject: CLUSTER count-failure-reports command added.

---
 src/cluster.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/cluster.c b/src/cluster.c
index 469016c33..17a5525b7 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4064,6 +4064,18 @@ void clusterCommand(redisClient *c) {
             addReplyBulkCString(c,ni);
             sdsfree(ni);
         }
+    } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") &&
+               c->argc == 3)
+    {
+        /* CLUSTER COUNT-FAILURE-REPORTS <node-id> */
+        clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
+
+        if (!n) {
+            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
+            return;
+        } else {
+            addReplyLongLong(c,clusterNodeFailureReportsCount(n));
+        }
     } else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
                (c->argc == 2 || c->argc == 3))
     {
--
cgit v1.2.1
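For illustration only (this example is not part of the patch; the node ID shown is hypothetical and the port assumes the create-cluster setup from the earlier patches), the new subcommand can be queried with redis-cli while some peers are flagging a node as PFAIL:

    redis-cli -p 30001 cluster count-failure-reports 3e3a6cb0d9a9a87168e266b0a0b24026c0aae3f0
    (integer) 2

The reply is the number of currently valid failure reports for that node, as returned by clusterNodeFailureReportsCount().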
From 2616d6f6dc6f5ccaae8069bf491fc00c6c823a4c Mon Sep 17 00:00:00 2001
From: antirez
Date: Thu, 29 Jan 2015 15:40:08 +0100
Subject: Cluster: magical 10% of nodes explained in comments.

---
 src/cluster.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 17a5525b7..995f16633 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2110,8 +2110,32 @@ void clusterSendPing(clusterLink *link, int type) {
      * nodes in handshake state, disconnected, are not considered. */
     int freshnodes = dictSize(server.cluster->nodes)-2;
 
-    /* How many gossip sections we want to add? 1/10 of the available nodes
-     * and anyway at least 3. */
+    /* How many gossip sections we want to add? 1/10 of the number of nodes
+     * and anyway at least 3. Why 1/10?
+     *
+     * If we have N masters, with N/10 entries, and we consider that in
+     * node_timeout we exchange with each other node at least 4 packets
+     * (we ping in the worst case in node_timeout/2 time, and we also
+     * receive two pings from the host), we have a total of 8 packets
+     * in the node_timeout*2 failure reports validity time. So we have
+     * that, for a single PFAIL node, we can expect to receive the following
+     * number of failure reports (in the specified window of time):
+     *
+     * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS:
+     *
+     * PROB = probability of being featured in a single gossip entry,
+     *        which is 1 / NUM_OF_NODES.
+     * ENTRIES = 10.
+     * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS.
+     *
+     * If we assume we have just masters (so num of nodes and num of masters
+     * is the same), with 1/10 we always get over the majority, and specifically
+     * 80% of the number of nodes, to account for many masters failing at the
+     * same time.
+     *
+     * Since we have non-voting slaves that lower the probability of an entry
+     * to feature our node, we set the number of entries per packet as
+     * 10% of the total nodes we have. */
     wanted = freshnodes/10;
     if (wanted < 3) wanted = 3;
 
--
cgit v1.2.1
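To make the comment's arithmetic concrete, here is a quick back-of-the-envelope check (an illustration, not part of the patch) for a hypothetical cluster of 100 masters, where each PING/PONG carries about 100/10 = 10 gossip entries:

    # Expected failure reports for one PFAIL node in the node_timeout*2 window:
    # PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS = (1/100) * 10 * (2*4*100)
    echo $(( 10 * 2 * 4 * 100 / 100 ))    # prints 80, i.e. about 80% of the masters

which is comfortably above the 51 reports needed for a majority in a cluster of that size.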