n-acd: better handle interfaces going temporarily down

NM sometimes brings an interface temporarily down (for example, to change a VLAN's MAC address so that it matches the parent interface's). When this happens, any recv() or send() in n-acd fails, the n-acd instance is reset to its initial state, and a DOWN event is reported to the manager, which currently does not handle it. The result is an inconsistent state.

There is no simple way of dealing with the DOWN event in the manager. What we can do instead is:

- ignore errors during recv(), because there is really nothing we can do except wait for the timeouts to expire;
- during probe, ignore errors during send() so that we don't exceed the probe timeout;
- during announcement, retry after a send() error to ensure we send all 3 announcements.

https://bugzilla.redhat.com/show_bug.cgi?id=1578675
author: Beniamino Galvani <bgalvani@redhat.com> 2018-05-17 16:58:00 +0200
committer: Beniamino Galvani <bgalvani@redhat.com> 2018-05-29 11:18:30 +0200
commit: d082af6b5c2383049d719151c1c1ac9614103e1d (patch)
tree: 1a6048e33ba73981c6bc739798f9f33515582693
parent: 2f4b3392d511ee3a87db3ffc0704e8974f2ec2b1 (diff)
download: NetworkManager-d082af6b5c2383049d719151c1c1ac9614103e1d.tar.gz
1 files changed, 21 insertions, 11 deletions
diff --git a/shared/n-acd/src/n-acd.c b/shared/n-acd/src/n-acd.c
index 42307aa3d6..9164f95895 100644
--- a/shared/n-acd/src/n-acd.c
+++ b/shared/n-acd/src/n-acd.c
@@ -534,7 +534,11 @@ static int n_acd_handle_timeout(NAcd *acd) {
                          */
 
                         r = n_acd_send(acd, NULL);
-                        if (r < 0)
+                        /*
+                         * During probe we must respect the total timeout and so
+                         * we ignore errors caused by a down interface.
+                         */
+                        if (r < 0 && r != -N_ACD_E_DOWN)
                                 return r;
 
                         if (++acd->n_iteration >= N_ACD_RFC_PROBE_NUM)
@@ -559,10 +563,18 @@ static int n_acd_handle_timeout(NAcd *acd) {
                  */
 
                 r = n_acd_send(acd, &acd->config.ip);
-                if (r < 0)
-                        return r;
+                if (r < 0) {
+                        if (r != -N_ACD_E_DOWN)
+                                return r;
+                        /*
+                         * We want to send all the 3 announcements even if the
+                         * interface goes temporarily down. Therefore, if send()
+                         * fails, don't increment the iteration and try again.
+                         */
+                } else
+                        acd->n_iteration++;
 
-                if (++acd->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) {
+                if (acd->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) {
                         /*
                          * Announcements are always scheduled according to the
                          * time-intervals specified in the spec. We always use
@@ -810,14 +822,12 @@ static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) {
                         return -EIO;
                 } else if (errno == ENETDOWN || errno == ENXIO) {
                         /*
-                         * We get ENETDOWN if the network-device goes down or is
-                         * removed. ENXIO might happen on async send-operations if the
-                         * network-device was unplugged and thus the kernel is no
-                         * longer aware of it.
-                         * In any case, we do not allow proceeding with this socket. We
-                         * stop the engine and notify the user gracefully.
+                         * The network device went down or was removed. Ignore
+                         * such errors and let the pending probe time out.
+                         * Subsequent reads will simply return EAGAIN until the
+                         * device is up again and has data queued.
                          */
-                        return -N_ACD_E_DOWN;
+                        return 0;
                 } else if (errno == EAGAIN) {
                         /*
                          * We cannot read data from the socket (we got EAGAIN). As a safety net
author	Beniamino Galvani <bgalvani@redhat.com>	2018-05-17 16:58:00 +0200
committer	Beniamino Galvani <bgalvani@redhat.com>	2018-05-29 11:18:30 +0200
commit	d082af6b5c2383049d719151c1c1ac9614103e1d (patch)
tree	1a6048e33ba73981c6bc739798f9f33515582693
parent	2f4b3392d511ee3a87db3ffc0704e8974f2ec2b1 (diff)
download	NetworkManager-d082af6b5c2383049d719151c1c1ac9614103e1d.tar.gz