diff options
author | Ilya Maximets <i.maximets@ovn.org> | 2022-02-21 13:59:51 +0100 |
---|---|---|
committer | Ilya Maximets <i.maximets@ovn.org> | 2022-02-24 17:04:32 +0100 |
commit | 6de8868d19eadd253f3301bce2bf52c981bc5fc4 (patch) | |
tree | b17286caf1e97c17e36e026a31a2a8f6cde88f7d /python | |
parent | 7aaa5b8137c0f48a6797f90d03a2b75ce8e3aaa8 (diff) | |
download | openvswitch-6de8868d19eadd253f3301bce2bf52c981bc5fc4.tar.gz |
reconnect: Fix broken inactivity probe if there is no other reason to wake up.
The purpose of reconnect_deadline__() function is twofold:
1. Its result is used to tell if the state has to be changed right now
in reconnect_run().
2. Its result also used to determine when the process need to wake up
and call reconnect_run() for a next time, i.e. when the state may
need to be changed next time.
Since introduction of the 'receive-attempted' feature, the function
returns LLONG_MAX if the deadline is in the future. That works for
the first case, but doesn't for the second one, because we don't
really know when we need to call reconnect_run().
This is the problem for applications where jsonrpc connection is the
only source of wake ups, e.g. ovn-northd. When the network goes down
silently, e.g. server looses IP address due to DHCP failure, ovn-northd
will sleep in the poll loop indefinitely after being told that it
doesn't need to call reconnect_run() (deadline == LLONG_MAX).
Fixing that by actually returning the expected time if it is in the
future, so we will know when to wake up. In order to keep the
'receive-attempted' feature, returning 'now + 1' in case where the
time has already passed, but receive wasn't attempted. That will
trigger a fast wake up, so the application will be able to attempt the
receive even if there was no real events. In a correctly written
application we should not fall into this case more than once in a row.
'+ 1' ensures that we will not transition into a different state
prematurely, i.e. before the receive is actually attempted.
Fixes: 4241d652e465 ("jsonrpc: Avoid disconnecting prematurely due to long poll intervals.")
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Diffstat (limited to 'python')
-rw-r--r-- | python/ovs/reconnect.py | 36 |
1 files changed, 25 insertions, 11 deletions
diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py index c4c6c87e9..6b0d023ae 100644 --- a/python/ovs/reconnect.py +++ b/python/ovs/reconnect.py @@ -44,7 +44,7 @@ class Reconnect(object): is_connected = False @staticmethod - def deadline(fsm): + def deadline(fsm, now): return None @staticmethod @@ -56,7 +56,7 @@ class Reconnect(object): is_connected = False @staticmethod - def deadline(fsm): + def deadline(fsm, now): return None @staticmethod @@ -68,7 +68,7 @@ class Reconnect(object): is_connected = False @staticmethod - def deadline(fsm): + def deadline(fsm, now): return fsm.state_entered + fsm.backoff @staticmethod @@ -80,7 +80,7 @@ class Reconnect(object): is_connected = False @staticmethod - def deadline(fsm): + def deadline(fsm, now): return fsm.state_entered + max(1000, fsm.backoff) @staticmethod @@ -92,13 +92,24 @@ class Reconnect(object): is_connected = True @staticmethod - def deadline(fsm): + def deadline(fsm, now): if fsm.probe_interval: base = max(fsm.last_activity, fsm.state_entered) expiration = base + fsm.probe_interval - if (fsm.last_receive_attempt is None or + if (now < expiration or + fsm.last_receive_attempt is None or fsm.last_receive_attempt >= expiration): + # We still have time before the expiration or the time has + # already passed and there was no activity. In the first + # case we need to wait for the expiration, in the second - + # we're already past the deadline. */ return expiration + else: + # Time has already passed, but we didn't attempt to receive + # anything. We need to wake up and try to receive even if + # nothing is pending, so we can update the expiration time + # or transition to a different state. + return now + 1 return None @staticmethod @@ -114,12 +125,15 @@ class Reconnect(object): is_connected = True @staticmethod - def deadline(fsm): + def deadline(fsm, now): if fsm.probe_interval: expiration = fsm.state_entered + fsm.probe_interval - if (fsm.last_receive_attempt is None or + if (now < expiration or + fsm.last_receive_attempt is None or fsm.last_receive_attempt >= expiration): return expiration + else: + return now + 1 return None @staticmethod @@ -134,7 +148,7 @@ class Reconnect(object): is_connected = False @staticmethod - def deadline(fsm): + def deadline(fsm, now): return fsm.state_entered @staticmethod @@ -545,7 +559,7 @@ class Reconnect(object): returned if the "probe interval" is nonzero--see self.set_probe_interval()).""" - deadline = self.state.deadline(self) + deadline = self.state.deadline(self, now) if deadline is not None and now >= deadline: return self.state.run(self, now) else: @@ -562,7 +576,7 @@ class Reconnect(object): """Returns the number of milliseconds after which self.run() should be called if nothing else notable happens in the meantime, or None if this is currently unnecessary.""" - deadline = self.state.deadline(self) + deadline = self.state.deadline(self, now) if deadline is not None: remaining = deadline - now return max(0, remaining) |