summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2015-01-29 15:45:21 +0100
committerantirez <antirez@gmail.com>2015-01-29 15:45:21 +0100
commit147df6f035236fbbe86b6f6e9906ca5571fffe3c (patch)
tree53349a27400db65e5fa9e4f1c77f67b4e0e5ba6b
parent267f2ec40bd48168910364c5fd6aa02fae1037dd (diff)
parent2616d6f6dc6f5ccaae8069bf491fc00c6c823a4c (diff)
downloadredis-147df6f035236fbbe86b6f6e9906ca5571fffe3c.tar.gz
Merge branch 'testing' into 3.0
-rw-r--r--src/cluster.c95
-rw-r--r--utils/create-cluster/README27
-rwxr-xr-xutils/create-cluster/create-cluster67
3 files changed, 172 insertions, 17 deletions
diff --git a/src/cluster.c b/src/cluster.c
index ce544970b..995f16633 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2037,7 +2037,8 @@ void clusterBroadcastMessage(void *buf, size_t len) {
dictReleaseIterator(di);
}
-/* Build the message header */
+/* Build the message header. hdr must point to a buffer at least
+ * sizeof(clusterMsg) in bytes. */
void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
int totlen = 0;
uint64_t offset;
@@ -2098,40 +2099,84 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
/* Send a PING or PONG packet to the specified node, making sure to add enough
* gossip informations. */
void clusterSendPing(clusterLink *link, int type) {
- unsigned char buf[sizeof(clusterMsg)+sizeof(clusterMsgDataGossip)*3];
- clusterMsg *hdr = (clusterMsg*) buf;
- int gossipcount = 0, totlen;
- /* freshnodes is the number of nodes we can still use to populate the
- * gossip section of the ping packet. Basically we start with the nodes
- * we have in memory minus two (ourself and the node we are sending the
- * message to). Every time we add a node we decrement the counter, so when
- * it will drop to <= zero we know there is no more gossip info we can
- * send. */
+ unsigned char *buf;
+ clusterMsg *hdr;
+ int gossipcount = 0; /* Number of gossip sections added so far. */
+ int wanted; /* Number of gossip sections we want to append if possible. */
+ int totlen; /* Total packet length. */
+ /* freshnodes is the max number of nodes we can hope to append at all:
+ * nodes available minus two (ourself and the node we are sending the
+ * message to). However practically there may be less valid nodes since
+ * nodes in handshake state, disconnected, are not considered. */
int freshnodes = dictSize(server.cluster->nodes)-2;
+ /* How many gossip sections we want to add? 1/10 of the number of nodes
+ * and anyway at least 3. Why 1/10?
+ *
+ * If we have N masters, with N/10 entries, and we consider that in
+ * node_timeout we exchange with each other node at least 4 packets
+ * (we ping in the worst case in node_timeout/2 time, and we also
+ * receive two pings from the host), we have a total of 8 packets
+ * in the node_timeout*2 falure reports validity time. So we have
+ * that, for a single PFAIL node, we can expect to receive the following
+ * number of failure reports (in the specified window of time):
+ *
+ * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS:
+ *
+ * PROB = probability of being featured in a single gossip entry,
+ * which is 1 / NUM_OF_NODES.
+ * ENTRIES = 10.
+ * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS.
+ *
+ * If we assume we have just masters (so num of nodes and num of masters
+ * is the same), with 1/10 we always get over the majority, and specifically
+ * 80% of the number of nodes, to account for many masters failing at the
+ * same time.
+ *
+ * Since we have non-voting slaves that lower the probability of an entry
+ * to feature our node, we set the number of entires per packet as
+ * 10% of the total nodes we have. */
+ wanted = freshnodes/10;
+ if (wanted < 3) wanted = 3;
+
+ /* Compute the maxium totlen to allocate our buffer. We'll fix the totlen
+ * later according to the number of gossip sections we really were able
+ * to put inside the packet. */
+ totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
+ totlen += (sizeof(clusterMsgDataGossip)*wanted);
+ /* Note: clusterBuildMessageHdr() expects the buffer to be always at least
+ * sizeof(clusterMsg) or more. */
+ if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg);
+ buf = zcalloc(totlen);
+ hdr = (clusterMsg*) buf;
+
+ /* Populate the header. */
if (link->node && type == CLUSTERMSG_TYPE_PING)
link->node->ping_sent = mstime();
clusterBuildMessageHdr(hdr,type);
/* Populate the gossip fields */
- while(freshnodes > 0 && gossipcount < 3) {
+ int maxiterations = wanted+10;
+ while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
dictEntry *de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
clusterMsgDataGossip *gossip;
int j;
+ /* Don't include this node: the whole packet header is about us
+ * already, so we just gossip about other nodes. */
+ if (this == myself) continue;
+
/* In the gossip section don't include:
- * 1) Myself.
- * 2) Nodes in HANDSHAKE state.
+ * 1) Nodes in HANDSHAKE state.
* 3) Nodes with the NOADDR flag set.
* 4) Disconnected nodes if they don't have configured slots.
*/
- if (this == myself ||
- this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
+ if (this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
(this->link == NULL && this->numslots == 0))
{
- freshnodes--; /* otherwise we may loop forever. */
- continue;
+ freshnodes--; /* Tecnically not correct, but saves CPU. */
+ continue;
}
/* Check if we already added this node */
@@ -2152,11 +2197,15 @@ void clusterSendPing(clusterLink *link, int type) {
gossip->flags = htons(this->flags);
gossipcount++;
}
+
+ /* Ready to send... fix the totlen fiend and queue the message in the
+ * output buffer. */
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
hdr->count = htons(gossipcount);
hdr->totlen = htonl(totlen);
clusterSendMessage(link,buf,totlen);
+ zfree(buf);
}
/* Send a PONG packet to every connected node that's not in handshake state
@@ -4039,6 +4088,18 @@ void clusterCommand(redisClient *c) {
addReplyBulkCString(c,ni);
sdsfree(ni);
}
+ } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") &&
+ c->argc == 3)
+ {
+ /* CLUSTER COUNT-FAILURE-REPORTS <NODE ID> */
+ clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
+
+ if (!n) {
+ addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
+ return;
+ } else {
+ addReplyLongLong(c,clusterNodeFailureReportsCount(n));
+ }
} else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
(c->argc == 2 || c->argc == 3))
{
diff --git a/utils/create-cluster/README b/utils/create-cluster/README
new file mode 100644
index 000000000..f3a3f0883
--- /dev/null
+++ b/utils/create-cluster/README
@@ -0,0 +1,27 @@
+Create-custer is a small script used to easily start a big number of Redis
+instances configured to run in cluster mode. Its main goal is to allow manual
+testing in a condition which is not easy to replicate with the Redis cluster
+unit tests, for example when a lot of instances are needed in order to trigger
+a give bug.
+
+The tool can also be used just to easily create a number of instances in a
+Redis Cluster in order to experiment a bit with the system.
+
+USAGE
+---
+
+To create a cluster, follow this steps:
+
+1. Edit create-cluster and change the start / end port, depending on the
+number of instances you want to create.
+2. Use "./create-cluster start" in order to run the instances.
+3. Use "./create-cluster create" in order to execute redis-trib create, so that
+an actual Redis cluster will be created.
+4. Now you are ready to play with the cluster. AOF files and logs for each instances are created in the current directory.
+
+In order to stop a cluster:
+
+1. Use "./craete-cluster stop" to stop all the instances. After you stopped the instances you can use "./create-cluster start" to restart them if you change ideas.
+2. Use "./create-cluster clean" to remove all the AOF / log files to restat with a clean environment.
+
+It is currently hardcoded that you start a cluster where each master has one slave, since the script is pretty basic.
diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster
new file mode 100755
index 000000000..76f61091d
--- /dev/null
+++ b/utils/create-cluster/create-cluster
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+PORT=30000
+ENDPORT=30006
+TIMEOUT=15000
+
+if [ "$1" == "start" ]
+then
+ while [ $((PORT < ENDPORT)) != "0" ]; do
+ PORT=$((PORT+1))
+ echo "Starting $PORT"
+ ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes
+ done
+ exit 0
+fi
+
+if [ "$1" == "create" ]
+then
+ HOSTS=""
+ while [ $((PORT < ENDPORT)) != "0" ]; do
+ PORT=$((PORT+1))
+ HOSTS="$HOSTS 127.0.0.1:$PORT"
+ done
+ ../../src/redis-trib.rb create --replicas 1 $HOSTS
+ exit 0
+fi
+
+if [ "$1" == "stop" ]
+then
+ while [ $((PORT < ENDPORT)) != "0" ]; do
+ PORT=$((PORT+1))
+ echo "Stopping $PORT"
+ redis-cli -p $PORT shutdown nosave
+ done
+ exit 0
+fi
+
+if [ "$1" == "join" ]
+then
+ while [ $((PORT < ENDPORT)) != "0" ]; do
+ PORT=$((PORT+1))
+ echo "Joining $PORT"
+ redis-cli -p $PORT CLUSTER MEET 127.0.0.1 10002
+ done
+
+ echo "Waiting 5 seconds"
+ sleep 5
+
+ PORT=30000
+ while [ $((PORT < ENDPORT)) != "0" ]; do
+ PORT=$((PORT+1))
+ echo "Replicate $PORT"
+ redis-cli -p $PORT CLUSTER REPLICATE $2
+ done
+ exit 0
+fi
+
+if [ "$1" == "clean" ]
+then
+ rm -rf *.log
+ rm -rf appendonly*.aof
+ rm -rf dump*.rdb
+ rm -rf nodes*.conf
+ exit 0
+fi
+
+echo "Usage: $0 [start|create|stop|join|clean]"