Cluster: better cluster state transiction handling.

Before we relied on the global cluster state to make sure all the hash slots are linked to some node, when getNodeByQuery() is called. So finding the hash slot unbound was checked with an assertion. However this is fragile. The cluster state is often updated in the clusterBeforeSleep() function, and not ASAP on state change, so it may happen to process clients with a cluster state that is 'ok' but yet certain hash slots set to NULL. With this commit the condition is also checked in getNodeByQuery() and reported with a identical error code of -CLUSTERDOWN but slightly different error message so that we have more debugging clue in the future. Root cause of issue #2288.
author: antirez <antirez@gmail.com> 2015-03-20 09:57:10 +0100
committer: antirez <antirez@gmail.com> 2015-03-20 09:59:28 +0100
commit: 25c0f5ac63320e85f81750bd0f999fc6f5eba5c1 (patch)
tree: 47329f6724d24fa64d3a0ce8119724b652c2de9f
parent: 2ecb5edf3454e3752a428e50b6ee068e357c7fae (diff)
download: redis-25c0f5ac63320e85f81750bd0f999fc6f5eba5c1.tar.gz
3 files changed, 21 insertions, 2 deletions
diff --git a/src/cluster.c b/src/cluster.c
index e24b15d2b..a5f682ba8 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4703,7 +4703,12 @@ void readwriteCommand(redisClient *c) {
  *
  * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains mutliple keys
  * belonging to the same slot, but the slot is not stable (in migration or
- * importing state, likely because a resharding is in progress). */
+ * importing state, likely because a resharding is in progress).
+ *
+ * REDIS_CLUSTER_REDIR_DOWN if the request addresses a slot which is not
+ * bound to any node. In this case the cluster global state should be already
+ * "down" but it is fragile to rely on the update of the global state, so
+ * we also handle it here. */
 clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
     clusterNode *n = NULL;
     robj *firstkey = NULL;
@@ -4757,7 +4762,18 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg
                 firstkey = thiskey;
                 slot = thisslot;
                 n = server.cluster->slots[slot];
-                redisAssertWithInfo(c,firstkey,n != NULL);
+
+                /* Error: If a slot is not served, we are in "cluster down"
+                 * state. However the state is yet to be updated, so this was
+                 * not trapped earlier in processCommand(). Report the same
+                 * error to the client. */
+                if (n == NULL) {
+                    getKeysFreeResult(keyindex);
+                    if (error_code)
+                        *error_code = REDIS_CLUSTER_REDIR_DOWN;
+                    return NULL;
+                }
+
                 /* If we are migrating or importing this slot, we need to check
                  * if we have all the keys in the request (the only way we
                  * can safely serve the request, otherwise we return a TRYAGAIN
diff --git a/src/cluster.h b/src/cluster.h
index ef5caf0d6..8eaa0ab98 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -30,6 +30,7 @@
 #define REDIS_CLUSTER_REDIR_UNSTABLE 2      /* Keys in slot resharding. */
 #define REDIS_CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
 #define REDIS_CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
+#define REDIS_CLUSTER_REDIR_DOWN 5          /* -CLUSTERDOWN error. */
 
 struct clusterNode;
 
diff --git a/src/redis.c b/src/redis.c
index 48e7f24ea..e216109c1 100644
--- a/src/redis.c
+++ b/src/redis.c
@@ -2219,6 +2219,8 @@ int processCommand(redisClient *c) {
                      * but the slot is not "stable" currently as there is
                      * a migration or import in progress. */
                     addReplySds(c,sdsnew("-TRYAGAIN Multiple keys request during rehashing of slot\r\n"));
+                } else if (error_code == REDIS_CLUSTER_REDIR_DOWN) {
+                    addReplySds(c,sdsnew("-CLUSTERDOWN The cluster is down. Hash slot is unbound\r\n"));
                 } else {
                     redisPanic("getNodeByQuery() unknown error.");
                 }
author	antirez <antirez@gmail.com>	2015-03-20 09:57:10 +0100
committer	antirez <antirez@gmail.com>	2015-03-20 09:59:28 +0100
commit	25c0f5ac63320e85f81750bd0f999fc6f5eba5c1 (patch)
tree	47329f6724d24fa64d3a0ce8119724b652c2de9f
parent	2ecb5edf3454e3752a428e50b6ee068e357c7fae (diff)
download	redis-25c0f5ac63320e85f81750bd0f999fc6f5eba5c1.tar.gz