summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShawn Routhier <sar@isc.org>2014-11-10 19:04:13 -0800
committerShawn Routhier <sar@isc.org>2014-11-10 19:04:13 -0800
commit21d3034757a2e33cfea31f93b8585b20e8a22bbe (patch)
tree166741338d160dfb51b7cc88137d2515a862e7eb
parent6444928cf5788ced056e39a27142a14e61b0a816 (diff)
downloadisc-dhcp-21d3034757a2e33cfea31f93b8585b20e8a22bbe.tar.gz
[master] Patch the failover code to avoid deadlocks
Patch for 36810 & 20352 This coves several related problems 1) When the primary is in conflict done it allows the secondary to transition around resolution interrupted and potentical conflict previously the primary would die on an illegal state. 2) It allows the servers to restart a bind update request. Previously if one of the servers sent an udpate request and there died (or had the communications interrupted) in some states the first server wouldn't retransmit a new update request and the other server wouldn't send any bind updates. This was noticed in potential conflict. 3) Updated the state transitions to move the leases on the ack queue back to the update queue in case of conflict-done as we might need to retransmit them all. 4) Updated a transition from startup to potentical conflict instead of resolution interrupted when the servers reconnect during the startup phase in order to avoid a diffferent dead lock.
-rw-r--r--RELNOTES9
-rw-r--r--includes/dhcpd.h1
-rw-r--r--server/failover.c206
3 files changed, 143 insertions, 73 deletions
diff --git a/RELNOTES b/RELNOTES
index a4c36129..93acdfd6 100644
--- a/RELNOTES
+++ b/RELNOTES
@@ -137,6 +137,15 @@ by Eric Young (eay@cryptsoft.com).
the patch upon which the fix is based.
[ISC-Bugs #32222]
+- In the failover code, handle the case of communications being interrupted
+ when the servers are dealing with POTENTIAL-CONFLICT. This patch allows
+ the primary to accept the secondary moving from POTENTIAL-CONFLICT to
+ RESOLUTION-INTERRUPTED as well as handling the bind update process better.
+ In addition the code to resend update or update all requests has been
+ modified to send requests more often.
+ [ISC-Bugs #36810]
+ [ISC-Bugs #20352]
+
Changes since 4.3.1b1
- Modify the linux and openwrt dhclient scripts to process information
diff --git a/includes/dhcpd.h b/includes/dhcpd.h
index 2043708f..9ad641be 100644
--- a/includes/dhcpd.h
+++ b/includes/dhcpd.h
@@ -3419,6 +3419,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *,
isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *,
const char *);
isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state);
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *);
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
enum failover_state);
isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,
diff --git a/server/failover.c b/server/failover.c
index eb2612b7..e6d9fdd6 100644
--- a/server/failover.c
+++ b/server/failover.c
@@ -1519,8 +1519,16 @@ isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *state,
/* In these situations, we remain in the current
* state, or if in startup enter those states.
*/
- case communications_interrupted:
case conflict_done:
+ /* As the peer may not have received or may have
+ * lost track of updates we sent previously we
+ * rescind them, causing us to retransmit them
+ * on an update request.
+ */
+ dhcp_failover_rescind_updates(state);
+ /* fall through */
+
+ case communications_interrupted:
case partner_down:
case paused:
case recover:
@@ -1703,6 +1711,52 @@ isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state)
return ISC_R_SUCCESS;
}
+/*!
+ * \brief Return any leases on the ack queue back to the update queue
+ *
+ * Re-schedule any pending updates by moving them from the ack queue
+ * (update sent awaiting response) back to the update queue (need to
+ * send an update for this lease). This will result in a retransmission
+ * of the update.
+ *
+ * \param state is the state block for the failover connection we are
+ * updating.
+ */
+
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *state)
+{
+ struct lease *lp;
+
+ if (state->ack_queue_tail == NULL)
+ return;
+
+ /* Zap the flags. */
+ for (lp = state->ack_queue_head; lp; lp = lp->next_pending)
+ lp->flags = ((lp->flags & ~ON_ACK_QUEUE) | ON_UPDATE_QUEUE);
+
+ /* Now hook the ack queue to the beginning of the update queue. */
+ if (state->update_queue_head) {
+ lease_reference(&state->ack_queue_tail->next_pending,
+ state->update_queue_head, MDL);
+ lease_dereference(&state->update_queue_head, MDL);
+ }
+ lease_reference(&state->update_queue_head, state->ack_queue_head, MDL);
+
+ if (!state->update_queue_tail) {
+#if defined (POINTER_DEBUG)
+ if (state->ack_queue_tail->next_pending) {
+ log_error("next pending on ack queue tail.");
+ abort();
+ }
+#endif
+ lease_reference(&state->update_queue_tail,
+ state->ack_queue_tail, MDL);
+ }
+ lease_dereference(&state->ack_queue_tail, MDL);
+ lease_dereference(&state->ack_queue_head, MDL);
+ state->cur_unacked_updates = 0;
+}
+
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
enum failover_state new_state)
{
@@ -1721,37 +1775,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
case normal:
case potential_conflict:
case partner_down:
- if (state -> ack_queue_tail) {
- struct lease *lp;
-
- /* Zap the flags. */
- for (lp = state -> ack_queue_head; lp; lp = lp -> next_pending)
- lp -> flags = ((lp -> flags & ~ON_ACK_QUEUE) |
- ON_UPDATE_QUEUE);
-
- /* Now hook the ack queue to the beginning of the update
- queue. */
- if (state -> update_queue_head) {
- lease_reference (&state -> ack_queue_tail -> next_pending,
- state -> update_queue_head, MDL);
- lease_dereference (&state -> update_queue_head, MDL);
- }
- lease_reference (&state -> update_queue_head,
- state -> ack_queue_head, MDL);
- if (!state -> update_queue_tail) {
-#if defined (POINTER_DEBUG)
- if (state -> ack_queue_tail -> next_pending) {
- log_error ("next pending on ack queue tail.");
- abort ();
- }
-#endif
- lease_reference (&state -> update_queue_tail,
- state -> ack_queue_tail, MDL);
- }
- lease_dereference (&state -> ack_queue_tail, MDL);
- lease_dereference (&state -> ack_queue_head, MDL);
- state -> cur_unacked_updates = 0;
- }
+ /* Move the ack queue to the update queue */
+ dhcp_failover_rescind_updates(state);
+
/* We will re-queue a timeout later, if applicable. */
cancel_timeout (dhcp_failover_keepalive, state);
break;
@@ -1859,7 +1885,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
break;
case potential_conflict:
- if (state -> i_am == primary)
+ if ((state->i_am == primary) ||
+ ((state->i_am == secondary) &&
+ (state->partner.state == conflict_done)))
dhcp_failover_send_update_request (state);
break;
@@ -1960,7 +1988,18 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
if (state -> partner.state == new_state && state -> me.state) {
switch (state -> me.state) {
case startup:
- dhcp_failover_set_state (state, state -> saved_state);
+ /*
+ * If we have a peer state we must be connected.
+ * If so we should move to potential_conflict
+ * instead of resolution_interrupted, otherwise
+ * back to whereever we were before we stopped.
+ */
+ if (state->saved_state == resolution_interrupted)
+ dhcp_failover_set_state(state,
+ potential_conflict);
+ else
+ dhcp_failover_set_state(state,
+ state->saved_state);
return ISC_R_SUCCESS;
case unknown_state:
@@ -2183,6 +2222,17 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
dhcp_failover_set_state(state, new_state);
break;
+ case potential_conflict:
+ case resolution_interrupted:
+ /*
+ * This can happen when the connection is lost and
+ * recovered after the primary has moved to
+ * conflict-done but the secondary is still in
+ * potential-conflict. In that case, we have to
+ * remain in conflict-done.
+ */
+ break;
+
default:
log_fatal("Peer %s: Invalid attempt to move from %s "
"to %s while local state is conflict-done.",
@@ -4861,35 +4911,42 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
# define FMA (char *)0, (unsigned *)0, 0
#endif
- if (!state -> link_to_peer ||
- state -> link_to_peer -> type != dhcp_type_failover_link)
- return DHCP_R_INVALIDARG;
- link = (dhcp_failover_link_t *)state -> link_to_peer;
+ if (!state->link_to_peer ||
+ state->link_to_peer->type != dhcp_type_failover_link)
+ return (DHCP_R_INVALIDARG);
+ link = (dhcp_failover_link_t *)state->link_to_peer;
- if (!link -> outer || link -> outer -> type != omapi_type_connection)
- return DHCP_R_INVALIDARG;
+ if (!link->outer || link->outer->type != omapi_type_connection)
+ return (DHCP_R_INVALIDARG);
- if (state -> curUPD)
- return ISC_R_ALREADYRUNNING;
+ /* We allow an update to be restarted in case we requested an update
+ * and were interrupted by something. If we had an ALL going we need
+ * to restart that. Otherwise we simply continue with the request */
+ if (state->curUPD == FTM_UPDREQALL) {
+ return (dhcp_failover_send_update_request_all(state));
+ }
- status = (dhcp_failover_put_message
- (link, link -> outer,
- FTM_UPDREQ, link->xid++,
- (failover_option_t *)0));
+ status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQ,
+ link->xid++, NULL));
- if (status == ISC_R_SUCCESS)
- state -> curUPD = FTM_UPDREQ;
+ state->curUPD = FTM_UPDREQ;
#if defined (DEBUG_FAILOVER_MESSAGES)
if (status != ISC_R_SUCCESS)
- failover_print (FMA, " (failed)");
- failover_print (FMA, ")");
+ failover_print(FMA, " (failed)");
+ failover_print(FMA, ")");
if (obufix) {
- log_debug ("%s", obuf);
+ log_debug("%s", obuf);
}
#endif
- log_info ("Sent update request message to %s", state -> name);
- return status;
+
+ if (status == ISC_R_SUCCESS) {
+ log_info("Sent update request message to %s", state->name);
+ } else {
+ log_error("Failed to send update request all message to %s: %s",
+ state->name, isc_result_totext(status));
+ }
+ return (status);
}
isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
@@ -4907,36 +4964,39 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
# define FMA (char *)0, (unsigned *)0, 0
#endif
- if (!state -> link_to_peer ||
- state -> link_to_peer -> type != dhcp_type_failover_link)
- return DHCP_R_INVALIDARG;
- link = (dhcp_failover_link_t *)state -> link_to_peer;
+ if (!state->link_to_peer ||
+ state->link_to_peer->type != dhcp_type_failover_link)
+ return (DHCP_R_INVALIDARG);
+ link = (dhcp_failover_link_t *)state->link_to_peer;
- if (!link -> outer || link -> outer -> type != omapi_type_connection)
- return DHCP_R_INVALIDARG;
+ if (!link->outer || link->outer->type != omapi_type_connection)
+ return (DHCP_R_INVALIDARG);
- /* If there is an UPDREQ in progress, then upgrade to UPDREQALL. */
- if (state -> curUPD && (state -> curUPD != FTM_UPDREQ))
- return ISC_R_ALREADYRUNNING;
+ /* We allow an update to be restarted in case we requested an update
+ * and were interrupted by something.
+ */
- status = (dhcp_failover_put_message
- (link, link -> outer,
- FTM_UPDREQALL, link->xid++,
- (failover_option_t *)0));
+ status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQALL,
+ link->xid++, NULL));
- if (status == ISC_R_SUCCESS)
- state -> curUPD = FTM_UPDREQALL;
+ state->curUPD = FTM_UPDREQALL;
#if defined (DEBUG_FAILOVER_MESSAGES)
if (status != ISC_R_SUCCESS)
- failover_print (FMA, " (failed)");
- failover_print (FMA, ")");
+ failover_print(FMA, " (failed)");
+ failover_print(FMA, ")");
if (obufix) {
- log_debug ("%s", obuf);
+ log_debug("%s", obuf);
}
#endif
- log_info ("Sent update request all message to %s", state -> name);
- return status;
+
+ if (status == ISC_R_SUCCESS) {
+ log_info("Sent update request all message to %s", state->name);
+ } else {
+ log_error("Failed to send update request all message to %s: %s",
+ state->name, isc_result_totext(status));
+ }
+ return (status);
}
isc_result_t dhcp_failover_send_update_done (dhcp_failover_state_t *state)