summaryrefslogtreecommitdiff
path: root/ovn
diff options
context:
space:
mode:
authorLance Richardson <lrichard@redhat.com>2016-07-07 20:31:08 -0400
committerBen Pfaff <blp@ovn.org>2016-07-23 08:33:13 -0700
commitc8d7a71a0d8514940644f98518d03d9caecc2554 (patch)
treeba4c5f000427abd7fd212fc11ebd50d167d7f131 /ovn
parenta4d04282f12ed20d89edee329c79c84176ad5598 (diff)
downloadopenvswitch-c8d7a71a0d8514940644f98518d03d9caecc2554.tar.gz
ovn-controller: eliminate stall in ofctrl state machine
The "ovn -- 2 HVs, 3 LRs connected via LS, static routes" test case currently exhibits frequent failures. These failures occur because, at the time that the test packets are sent to verify forwarding, no flows have been installed in the vswitch for one of the hypervisors. The state machine implemented by ofctrl_run() is intended to iterate as long as progress is being made, either as long as the state continues to change or as long as packets are being received. Unfortunately, the code had a bug: if receiving a packet caused the state to change, it didn't call the state's run function again to try to see if it would change the state. This caused a real problem in the following case: 1) The state is S_TLV_TABLE_MOD_SENT. 2) An OFPTYPE_NXT_TLV_TABLE_REPLY message is received. 3) No event (other than SB probe timer expiration) is expected that would unblock poll_block() in the main ovn-controller loop. In such a case, ofctrl_run() would receive the packet and advance the state, but not call the run function for the new state, and then leave the state machine paused until the next event (e.g. a timer event) occurred. This commit fixes the problem by continuing to iterate the state machine until the state remains the same and no packet is received in the same iteration. Without this fix, around 40 failures are seen out of 100 attempts, with this fix no failures have been observed in several hundred attempts (using an earlier version of this patch). Signed-off-by: Lance Richardson <lrichard@redhat.com> [blp@ovn.org refactored for clarity] Signed-off-by: Ben Pfaff <blp@ovn.org> Acked-by: Lance Richardson <lrichard@redhat.com>
Diffstat (limited to 'ovn')
-rw-r--r--ovn/controller/ofctrl.c56
1 files changed, 32 insertions, 24 deletions
diff --git a/ovn/controller/ofctrl.c b/ovn/controller/ofctrl.c
index 184e86fba..f0451b7c1 100644
--- a/ovn/controller/ofctrl.c
+++ b/ovn/controller/ofctrl.c
@@ -35,6 +35,7 @@
#include "openvswitch/vlog.h"
#include "ovn-controller.h"
#include "ovn/lib/actions.h"
+#include "poll-loop.h"
#include "physical.h"
#include "rconn.h"
#include "socket-util.h"
@@ -409,9 +410,10 @@ ofctrl_run(const struct ovsrec_bridge *br_int)
state = S_NEW;
}
- enum ofctrl_state old_state;
- do {
- old_state = state;
+ bool progress = true;
+ for (int i = 0; progress && i < 50; i++) {
+ /* Allow the state machine to run. */
+ enum ofctrl_state old_state = state;
switch (state) {
#define STATE(NAME) case NAME: run_##NAME(); break;
STATES
@@ -419,35 +421,41 @@ ofctrl_run(const struct ovsrec_bridge *br_int)
default:
OVS_NOT_REACHED();
}
- } while (state != old_state);
- for (int i = 0; state == old_state && i < 50; i++) {
+ /* Try to process a received packet. */
struct ofpbuf *msg = rconn_recv(swconn);
- if (!msg) {
- break;
- }
-
- const struct ofp_header *oh = msg->data;
- enum ofptype type;
- enum ofperr error;
+ if (msg) {
+ const struct ofp_header *oh = msg->data;
+ enum ofptype type;
+ enum ofperr error;
- error = ofptype_decode(&type, oh);
- if (!error) {
- switch (state) {
+ error = ofptype_decode(&type, oh);
+ if (!error) {
+ switch (state) {
#define STATE(NAME) case NAME: recv_##NAME(oh, type); break;
- STATES
+ STATES
#undef STATE
- default:
- OVS_NOT_REACHED();
+ default:
+ OVS_NOT_REACHED();
+ }
+ } else {
+ char *s = ofp_to_string(oh, ntohs(oh->length), 1);
+ VLOG_WARN("could not decode OpenFlow message (%s): %s",
+ ofperr_to_string(error), s);
+ free(s);
}
- } else {
- char *s = ofp_to_string(oh, ntohs(oh->length), 1);
- VLOG_WARN("could not decode OpenFlow message (%s): %s",
- ofperr_to_string(error), s);
- free(s);
+
+ ofpbuf_delete(msg);
}
- ofpbuf_delete(msg);
+ /* If we did some work, plan to go around again. */
+ progress = old_state != state || msg;
+ }
+ if (progress) {
+ /* We bailed out to limit the amount of work we do in one go, to allow
+ * other code a chance to run. We were still making progress at that
+ * point, so ensure that we come back again without waiting. */
+ poll_immediate_wake();
}
return (state == S_CLEAR_FLOWS || state == S_UPDATE_FLOWS