summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2015-03-20 16:42:49 +0100
committerantirez <antirez@gmail.com>2015-03-21 18:22:33 +0100
commit0595420b1e42c31b10ff28df889a07e57d802510 (patch)
treee342809b4717b948dc39d22f72264ec486525c67
parent2d34ec60bfb136c6fb10487307fd1c5dc8f14ba2 (diff)
downloadredis-0595420b1e42c31b10ff28df889a07e57d802510.tar.gz
Cluster: refactoring around configEpoch handling.
This commit moves the process of generating a new config epoch without consensus out of the clusterCommand() implementation, in order to make it reusable for other reasons (current target is to have a CLUSTER FAILOVER option forcing the failover when no master majority is reachable). Moreover the commit moves other functions which are similarly related to config epochs in a new logical section of the cluster.c file, just for clarity.
-rw-r--r--src/cluster.c224
1 files changed, 135 insertions, 89 deletions
diff --git a/src/cluster.c b/src/cluster.c
index 178ce9b75..4ff8fb4d6 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -74,27 +74,13 @@ void clusterCloseAllSlots(void);
void clusterSetNodeAsMaster(clusterNode *n);
void clusterDelNode(clusterNode *delnode);
sds representRedisNodeFlags(sds ci, uint16_t flags);
+uint64_t clusterGetMaxEpoch(void);
+int clusterBumpConfigEpochWithoutConsensus(void);
/* -----------------------------------------------------------------------------
* Initialization
* -------------------------------------------------------------------------- */
-/* Return the greatest configEpoch found in the cluster. */
-uint64_t clusterGetMaxEpoch(void) {
- uint64_t max = 0;
- dictIterator *di;
- dictEntry *de;
-
- di = dictGetSafeIterator(server.cluster->nodes);
- while((de = dictNext(di)) != NULL) {
- clusterNode *node = dictGetVal(de);
- if (node->configEpoch > max) max = node->configEpoch;
- }
- dictReleaseIterator(di);
- if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch;
- return max;
-}
-
/* Load the cluster config from 'filename'.
*
* If the file does not exist or is zero-length (this may happen because
@@ -928,6 +914,137 @@ void clusterRenameNode(clusterNode *node, char *newname) {
}
/* -----------------------------------------------------------------------------
+ * CLUSTER config epoch handling
+ * -------------------------------------------------------------------------- */
+
+/* Return the greatest configEpoch found in the cluster. */
+uint64_t clusterGetMaxEpoch(void) {
+ uint64_t max = 0;
+ dictIterator *di;
+ dictEntry *de;
+
+ di = dictGetSafeIterator(server.cluster->nodes);
+ while((de = dictNext(di)) != NULL) {
+ clusterNode *node = dictGetVal(de);
+ if (node->configEpoch > max) max = node->configEpoch;
+ }
+ dictReleaseIterator(di);
+ if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch;
+ return max;
+}
+
+/* If this node epoch is zero or is not already the greatest across the
+ * cluster (from the POV of the local configuration), this function will:
+ *
+ * 1) Generate a new config epoch increment the current epoch.
+ * 2) Assign the new epoch to this node, WITHOUT any consensus.
+ * 3) Persist the configuration on disk before sending packets with the
+ * new configuration.
+ *
+ * If the new config epoch is generated and assigend, REDIS_OK is returned,
+ * otherwise REDIS_ERR is returned (since the node has already the greatest
+ * configuration around) and no operation is performed.
+ *
+ * Important note: this function violates the principle that config epochs
+ * should be generated with consensus and should be unique across the cluster.
+ * However Redis Cluster uses this auto-generated new config epochs in two
+ * cases:
+ *
+ * 1) When slots are closed after importing. Otherwise resharding would be
+ * too exansive.
+ * 2) When CLUSTER FAILOVER is called with options that force a slave to
+ * failover its master even if there is not master majority able to
+ * create a new configuration epoch.
+ *
+ * Redis Cluster does not explode using this function, even in the case of
+ * a collision between this node and another node, generating the same
+ * configuration epoch unilaterally, because the config epoch conflict
+ * resolution algorithm will eventually move colliding nodes to different
+ * config epochs. However usign this function may violate the "last failover
+ * wins" rule, so should only be used with care. */
+int clusterBumpConfigEpochWithoutConsensus(void) {
+ uint64_t maxEpoch = clusterGetMaxEpoch();
+
+ if (myself->configEpoch == 0 ||
+ myself->configEpoch != maxEpoch)
+ {
+ server.cluster->currentEpoch++;
+ myself->configEpoch = server.cluster->currentEpoch;
+ clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
+ CLUSTER_TODO_FSYNC_CONFIG);
+ redisLog(REDIS_WARNING,
+ "New configEpoch set to %llu",
+ (unsigned long long) myself->configEpoch);
+ return REDIS_OK;
+ } else {
+ return REDIS_ERR;
+ }
+}
+
+/* This function is called when this node is a master, and we receive from
+ * another master a configuration epoch that is equal to our configuration
+ * epoch.
+ *
+ * BACKGROUND
+ *
+ * It is not possible that different slaves get the same config
+ * epoch during a failover election, because the slaves need to get voted
+ * by a majority. However when we perform a manual resharding of the cluster
+ * the node will assign a configuration epoch to itself without to ask
+ * for agreement. Usually resharding happens when the cluster is working well
+ * and is supervised by the sysadmin, however it is possible for a failover
+ * to happen exactly while the node we are resharding a slot to assigns itself
+ * a new configuration epoch, but before it is able to propagate it.
+ *
+ * So technically it is possible in this condition that two nodes end with
+ * the same configuration epoch.
+ *
+ * Another possibility is that there are bugs in the implementation causing
+ * this to happen.
+ *
+ * Moreover when a new cluster is created, all the nodes start with the same
+ * configEpoch. This collision resolution code allows nodes to automatically
+ * end with a different configEpoch at startup automatically.
+ *
+ * In all the cases, we want a mechanism that resolves this issue automatically
+ * as a safeguard. The same configuration epoch for masters serving different
+ * set of slots is not harmful, but it is if the nodes end serving the same
+ * slots for some reason (manual errors or software bugs) without a proper
+ * failover procedure.
+ *
+ * In general we want a system that eventually always ends with different
+ * masters having different configuration epochs whatever happened, since
+ * nothign is worse than a split-brain condition in a distributed system.
+ *
+ * BEHAVIOR
+ *
+ * When this function gets called, what happens is that if this node
+ * has the lexicographically smaller Node ID compared to the other node
+ * with the conflicting epoch (the 'sender' node), it will assign itself
+ * the greatest configuration epoch currently detected among nodes plus 1.
+ *
+ * This means that even if there are multiple nodes colliding, the node
+ * with the greatest Node ID never moves forward, so eventually all the nodes
+ * end with a different configuration epoch.
+ */
+void clusterHandleConfigEpochCollision(clusterNode *sender) {
+ /* Prerequisites: nodes have the same configEpoch and are both masters. */
+ if (sender->configEpoch != myself->configEpoch ||
+ !nodeIsMaster(sender) || !nodeIsMaster(myself)) return;
+ /* Don't act if the colliding node has a smaller Node ID. */
+ if (memcmp(sender->name,myself->name,REDIS_CLUSTER_NAMELEN) <= 0) return;
+ /* Get the next ID available at the best of this node knowledge. */
+ server.cluster->currentEpoch++;
+ myself->configEpoch = server.cluster->currentEpoch;
+ clusterSaveConfigOrDie(1);
+ redisLog(REDIS_VERBOSE,
+ "WARNING: configEpoch collision with node %.40s."
+ " configEpoch set to %llu",
+ sender->name,
+ (unsigned long long) myself->configEpoch);
+}
+
+/* -----------------------------------------------------------------------------
* CLUSTER nodes blacklist
*
* The nodes blacklist is just a way to ensure that a given node with a given
@@ -1399,69 +1516,6 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
}
}
-/* This function is called when this node is a master, and we receive from
- * another master a configuration epoch that is equal to our configuration
- * epoch.
- *
- * BACKGROUND
- *
- * It is not possible that different slaves get the same config
- * epoch during a failover election, because the slaves need to get voted
- * by a majority. However when we perform a manual resharding of the cluster
- * the node will assign a configuration epoch to itself without to ask
- * for agreement. Usually resharding happens when the cluster is working well
- * and is supervised by the sysadmin, however it is possible for a failover
- * to happen exactly while the node we are resharding a slot to assigns itself
- * a new configuration epoch, but before it is able to propagate it.
- *
- * So technically it is possible in this condition that two nodes end with
- * the same configuration epoch.
- *
- * Another possibility is that there are bugs in the implementation causing
- * this to happen.
- *
- * Moreover when a new cluster is created, all the nodes start with the same
- * configEpoch. This collision resolution code allows nodes to automatically
- * end with a different configEpoch at startup automatically.
- *
- * In all the cases, we want a mechanism that resolves this issue automatically
- * as a safeguard. The same configuration epoch for masters serving different
- * set of slots is not harmful, but it is if the nodes end serving the same
- * slots for some reason (manual errors or software bugs) without a proper
- * failover procedure.
- *
- * In general we want a system that eventually always ends with different
- * masters having different configuration epochs whatever happened, since
- * nothign is worse than a split-brain condition in a distributed system.
- *
- * BEHAVIOR
- *
- * When this function gets called, what happens is that if this node
- * has the lexicographically smaller Node ID compared to the other node
- * with the conflicting epoch (the 'sender' node), it will assign itself
- * the greatest configuration epoch currently detected among nodes plus 1.
- *
- * This means that even if there are multiple nodes colliding, the node
- * with the greatest Node ID never moves forward, so eventually all the nodes
- * end with a different configuration epoch.
- */
-void clusterHandleConfigEpochCollision(clusterNode *sender) {
- /* Prerequisites: nodes have the same configEpoch and are both masters. */
- if (sender->configEpoch != myself->configEpoch ||
- !nodeIsMaster(sender) || !nodeIsMaster(myself)) return;
- /* Don't act if the colliding node has a smaller Node ID. */
- if (memcmp(sender->name,myself->name,REDIS_CLUSTER_NAMELEN) <= 0) return;
- /* Get the next ID available at the best of this node knowledge. */
- server.cluster->currentEpoch++;
- myself->configEpoch = server.cluster->currentEpoch;
- clusterSaveConfigOrDie(1);
- redisLog(REDIS_VERBOSE,
- "WARNING: configEpoch collision with node %.40s."
- " configEpoch set to %llu",
- sender->name,
- (unsigned long long) myself->configEpoch);
-}
-
/* When this function is called, there is a packet to process starting
* at node->rcvbuf. Releasing the buffer is up to the caller, so this
* function should just handle the higher level stuff of processing the
@@ -3902,17 +3956,9 @@ void clusterCommand(redisClient *c) {
* failover happens at the same time we close the slot, the
* configEpoch collision resolution will fix it assigning
* a different epoch to each node. */
- uint64_t maxEpoch = clusterGetMaxEpoch();
-
- if (myself->configEpoch == 0 ||
- myself->configEpoch != maxEpoch)
- {
- server.cluster->currentEpoch++;
- myself->configEpoch = server.cluster->currentEpoch;
- clusterDoBeforeSleep(CLUSTER_TODO_FSYNC_CONFIG);
+ if (clusterBumpConfigEpochWithoutConsensus() == REDIS_OK) {
redisLog(REDIS_WARNING,
- "configEpoch set to %llu after importing slot %d",
- (unsigned long long) myself->configEpoch, slot);
+ "configEpoch updated after importing slot %d", slot);
}
server.cluster->importing_slots_from[slot] = NULL;
}