diff options
author | Thomas Haller <thaller@redhat.com> | 2022-02-24 09:39:03 +0100 |
---|---|---|
committer | Thomas Haller <thaller@redhat.com> | 2022-02-24 09:39:03 +0100 |
commit | d85b934370156496f169adab5322d38cb4bb419a (patch) | |
tree | 89f28ad3770ce7be2a5d10de3963461b863e795d | |
parent | 7a1734926a4d053080c1d57fb29602bc5bb49f20 (diff) | |
parent | 7c874ed4565623a89f417c026103e7925e775690 (diff) | |
download | NetworkManager-d85b934370156496f169adab5322d38cb4bb419a.tar.gz |
core: merge branch 'th/shutdown-timeout-increase'
https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/1113
-rw-r--r-- | TODO | 16 | ||||
-rw-r--r-- | src/core/NetworkManagerUtils.c | 9 | ||||
-rw-r--r-- | src/core/NetworkManagerUtils.h | 30 | ||||
-rw-r--r-- | src/core/devices/nm-device-ethernet.c | 2 | ||||
-rw-r--r-- | src/core/devices/nm-device.c | 2 | ||||
-rw-r--r-- | src/core/dns/nm-dns-dnsmasq.c | 4 | ||||
-rw-r--r-- | src/core/nm-firewall-utils.c | 4 | ||||
-rw-r--r-- | src/core/nm-pacrunner-manager.c | 4 | ||||
-rw-r--r-- | src/core/ppp/nm-ppp-manager.c | 2 | ||||
-rw-r--r-- | src/core/settings/nm-secret-agent.c | 2 |
10 files changed, 47 insertions, 28 deletions
@@ -72,25 +72,25 @@ shutdown starts. And no singleton getters work reliably after the main() functio because singletons unref themselves. In general, avoid singleton getters and see that somebody hands you a reference. -After NM_SHUTDOWN_TIMEOUT_MS we loose patience that it's taking too long. +After NM_SHUTDOWN_TIMEOUT_MAX_MSEC we loose patience that it's taking too long. We now log a debug message about who is still blocking shutdown. We also cancel the cancellables from nm_shutdown_wait_obj_register_cancellable() -and give NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG more time. If we then are still +and give NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC more time. If we then are still not complete, we log an error message about who is still blocking shutdown, and just exit with an assertion failure. We encountered a bug. This means, *all* async operations in NetworkManager must either be cancellable (and afterwards complete fast) or they must not take long to begin with. In particular, -every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MS, -and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MS too. +every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MAX_MSEC, +and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MAX_MSEC too. So if you make an async operation not cancellable, but guarantee that you don't take -longer than NM_SHUTDOWN_TIMEOUT_MS you are mostly fine (better would be to actually -complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MS timeout is -still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MS+NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG +longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC you are mostly fine (better would be to actually +complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout is +still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC+NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC is a bug. -As NM_SHUTDOWN_TIMEOUT_MS and nm_shutdown_wait_obj_register_object() API already exists, +As NM_SHUTDOWN_TIMEOUT_MAX_MSEC and nm_shutdown_wait_obj_register_object() API already exists, the first step is to ensure that all parts of NetworkManager can be shutdown and be terminated in a timely manner. diff --git a/src/core/NetworkManagerUtils.c b/src/core/NetworkManagerUtils.c index c50dbadad2..b7dd104b45 100644 --- a/src/core/NetworkManagerUtils.c +++ b/src/core/NetworkManagerUtils.c @@ -34,6 +34,11 @@ /*****************************************************************************/ +G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_1500_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC); +G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_5000_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC); + +/*****************************************************************************/ + /** * nm_utils_get_shared_wifi_permission: * @connection: the NMConnection to lookup the permission. @@ -1037,7 +1042,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was) * is still used. * * If @wait_type is %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE, then during shutdown - * (after %NM_SHUTDOWN_TIMEOUT_MS), the cancellable will be cancelled to notify + * (after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC), the cancellable will be cancelled to notify * the source of the shutdown. Note that otherwise, in this mode also @watched_obj * is only tracked with a weak-pointer. Especially, it does not register to the * "cancelled" signal to automatically unregister (otherwise, you would never @@ -1046,7 +1051,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was) * FIXME(shutdown): proper shutdown is not yet implemented, and registering * an object (currently) has no effect. * - * FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MS timeout, cancel + * FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout, cancel * all remaining %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE instances. Also, when somebody * enqueues a cancellable after that point, cancel it right away on an idle handler. * diff --git a/src/core/NetworkManagerUtils.h b/src/core/NetworkManagerUtils.h index c9210ba8c5..67c9cba471 100644 --- a/src/core/NetworkManagerUtils.h +++ b/src/core/NetworkManagerUtils.h @@ -103,23 +103,37 @@ NMPlatformRoutingRule *nm_ip_routing_rule_to_platform(const NMIPRoutingRule *rul /*****************************************************************************/ /* during shutdown, there are two relevant timeouts. One is - * NM_SHUTDOWN_TIMEOUT_MS which is plenty of time, that we give for all + * NM_SHUTDOWN_TIMEOUT_MAX_MSEC which is plenty of time, that we give for all * actions to complete. Of course, during shutdown components should hurry * to cleanup. * * When we initiate shutdown, we should start killing child processes - * with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MS, we send + * with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MAX_MSEC, we send * SIGKILL. * - * After NM_SHUTDOWN_TIMEOUT_MS, NetworkManager will however not yet terminate right - * away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG. This + * After NM_SHUTDOWN_TIMEOUT_MAX_MSEC, NetworkManager will however not yet terminate right + * away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC. This * should give time to reap the child process (after SIGKILL). * * So, the maximum time we should wait before sending SIGKILL should be at most - * NM_SHUTDOWN_TIMEOUT_MS. + * NM_SHUTDOWN_TIMEOUT_MAX_MSEC. */ -#define NM_SHUTDOWN_TIMEOUT_MS 1500 -#define NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG 500 +#define NM_SHUTDOWN_TIMEOUT_MAX_MSEC 5000 +#define NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC 500 + +/** + * NM_SHUTDOWN_TIMEOUT_1500_MSEC: this is just 1500 msec. The special + * thing about the define is that you are guaranteed that this is not + * longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC. + * When you perform an async operation, it must either be cancellable + * (and complete fast) or never take longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC. + * The usage of this macro makes that relation to NM_SHUTDOWN_TIMEOUT_MAX_MSEC + * explicit. + */ +#define NM_SHUTDOWN_TIMEOUT_1500_MSEC 1500 + +/* See NM_SHUTDOWN_TIMEOUT_1500_MSEC. */ +#define NM_SHUTDOWN_TIMEOUT_5000_MSEC 5000 typedef enum { /* There is no watched_obj argument, and the shutdown is delayed until the user @@ -131,7 +145,7 @@ typedef enum { NM_SHUTDOWN_WAIT_TYPE_OBJECT, /* The watched_obj argument is a GCancellable, and shutdown is delayed until the object - * gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MS, the + * gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MAX_MSEC, the * cancellable will be cancelled to notify listeners about the shutdown. */ NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE, } NMShutdownWaitType; diff --git a/src/core/devices/nm-device-ethernet.c b/src/core/devices/nm-device-ethernet.c index 6e18d119d0..842d863e8f 100644 --- a/src/core/devices/nm-device-ethernet.c +++ b/src/core/devices/nm-device-ethernet.c @@ -1014,7 +1014,7 @@ act_stage1_prepare(NMDevice *device, NMDeviceStateReason *out_failure_reason) * get confused and fail to negotiate the new connection. (rh #1023503) * * FIXME(shutdown): when exiting, we also need to wait before quitting, - * at least for additional NM_SHUTDOWN_TIMEOUT_MS seconds because + * at least for additional NM_SHUTDOWN_TIMEOUT_MAX_MSEC seconds because * otherwise after restart the device won't work for the first seconds. */ if (priv->ppp_data.last_pppoe_time_msec != 0) { diff --git a/src/core/devices/nm-device.c b/src/core/devices/nm-device.c index 78fecb1140..6886f3c1ce 100644 --- a/src/core/devices/nm-device.c +++ b/src/core/devices/nm-device.c @@ -7003,7 +7003,7 @@ sriov_op_queue(NMDevice *self, * * FIXME(shutdown): However, during shutdown we don't have a follow-up write request to cancel * this operation and we have to give it at least some time to complete. The solution is that - * we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MS + * we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MAX_MSEC * grace period we pull the plug and cancel it. */ op = g_slice_new(SriovOp); diff --git a/src/core/dns/nm-dns-dnsmasq.c b/src/core/dns/nm-dns-dnsmasq.c index f0abf564cb..1be42b958d 100644 --- a/src/core/dns/nm-dns-dnsmasq.c +++ b/src/core/dns/nm-dns-dnsmasq.c @@ -39,10 +39,10 @@ #define _NMLOG(level, ...) __NMLOG_DEFAULT(level, _NMLOG_DOMAIN, "dnsmasq", __VA_ARGS__) #define WAIT_MSEC_AFTER_SIGTERM 1000 -G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MS); +G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC); #define WAIT_MSEC_AFTER_SIGKILL 400 -G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG); +G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC); typedef void (*GlPidSpawnAsyncNotify)(GCancellable *cancellable, GPid pid, diff --git a/src/core/nm-firewall-utils.c b/src/core/nm-firewall-utils.c index ddac1137bf..8fa8059e00 100644 --- a/src/core/nm-firewall-utils.c +++ b/src/core/nm-firewall-utils.c @@ -422,7 +422,7 @@ _fw_nft_call_communicate_cb(GObject *source, GAsyncResult *result, gpointer user nm_g_main_context_push_thread_default_if_necessary(NULL); nm_shutdown_wait_obj_register_object(call_data->subprocess, "nft-terminate"); - G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG * 2 / 3); + G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC * 2 / 3); nm_g_subprocess_terminate_in_background(call_data->subprocess, 200); } } else if (g_subprocess_get_successful(call_data->subprocess)) { @@ -546,7 +546,7 @@ _fw_nft_call(GBytes *stdin_buf, call_data); call_data->timeout_source = - nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_MS * 2) / 3, + nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_1500_MSEC * 2) / 3, G_PRIORITY_DEFAULT, _fw_nft_call_timeout_cb, call_data, diff --git a/src/core/nm-pacrunner-manager.c b/src/core/nm-pacrunner-manager.c index 5ef425bb97..86a406326e 100644 --- a/src/core/nm-pacrunner-manager.c +++ b/src/core/nm-pacrunner-manager.c @@ -291,7 +291,7 @@ _call_destroy_proxy_configuration(NMPacrunnerManager *self, g_variant_new("(o)", path), G_VARIANT_TYPE("()"), G_DBUS_CALL_FLAGS_NO_AUTO_START, - NM_SHUTDOWN_TIMEOUT_MS, + NM_SHUTDOWN_TIMEOUT_1500_MSEC, priv->cancellable, _call_destroy_proxy_configuration_cb, conf_id_ref(conf_id)); @@ -315,7 +315,7 @@ _call_create_proxy_configuration(NMPacrunnerManager *self, conf_id->parameters, G_VARIANT_TYPE("(o)"), G_DBUS_CALL_FLAGS_NO_AUTO_START, - NM_SHUTDOWN_TIMEOUT_MS, + NM_SHUTDOWN_TIMEOUT_1500_MSEC, priv->cancellable, _call_create_proxy_configuration_cb, conf_id_ref(conf_id)); diff --git a/src/core/ppp/nm-ppp-manager.c b/src/core/ppp/nm-ppp-manager.c index dd6b1bc7f0..213e3761cc 100644 --- a/src/core/ppp/nm-ppp-manager.c +++ b/src/core/ppp/nm-ppp-manager.c @@ -1246,7 +1246,7 @@ _ppp_manager_stop(NMPPPManager *self, SIGTERM, LOGD_PPP, "pppd", - 5000, + NM_SHUTDOWN_TIMEOUT_5000_MSEC, _stop_child_cb, handle); diff --git a/src/core/settings/nm-secret-agent.c b/src/core/settings/nm-secret-agent.c index b222bd5ced..a3df449739 100644 --- a/src/core/settings/nm-secret-agent.c +++ b/src/core/settings/nm-secret-agent.c @@ -511,7 +511,7 @@ nm_secret_agent_cancel_call(NMSecretAgent *self, NMSecretAgentCallId *call_id) g_variant_new("(os)", call_id->path, call_id->setting_name), G_VARIANT_TYPE("()"), G_DBUS_CALL_FLAGS_NO_AUTO_START, - NM_SHUTDOWN_TIMEOUT_MS, + NM_SHUTDOWN_TIMEOUT_1500_MSEC, NULL, /* this operation is not cancellable. We rely on the timeout. */ _call_cancel_cb, call_id); |