diff options
author | Steve Baker <sbaker@redhat.com> | 2023-02-27 11:05:10 +1300 |
---|---|---|
committer | Steve Baker <sbaker@redhat.com> | 2023-02-27 11:10:31 +1300 |
commit | 6a9e319fbeb0851c51bb14b9c4c3c5fa4685b14d (patch) | |
tree | 7505771b8835a033e60c08c39827f03011f3d682 /ironic/common | |
parent | e54ee2ba4cb818e25c75fcdc69f7ff1dc4956c73 (diff) | |
download | ironic-6a9e319fbeb0851c51bb14b9c4c3c5fa4685b14d.tar.gz |
On rpc service stop, wait for node reservation release
Instead of clearing existing reservations at the beginning of
del_host, wait for the tasks holding them to run to completion. The
check is repeated until the conductor process exits, which happens when
one of the following occurs:
- All reservations for this conductor are released
- CONF.graceful_shutdown_timeout has elapsed
- The process manager (systemd, kubernetes) sends SIGKILL after the
configured graceful period
Because the default values of [DEFAULT]graceful_shutdown_timeout and
[conductor]heartbeat_timeout are the same (60s), no other conductor
will claim a node as an orphan until this conductor exits.
Change-Id: Ib8db915746228cd87272740825aaaea1fdf953c7
Diffstat (limited to 'ironic/common')
-rw-r--r-- | ironic/common/rpc_service.py | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py index cb0f23c98..a74f6bab3 100644 --- a/ironic/common/rpc_service.py +++ b/ironic/common/rpc_service.py @@ -100,7 +100,8 @@ class RPCService(service.Service): seconds=CONF.hash_ring_reset_interval) try: - self.manager.del_host(deregister=self.deregister) + self.manager.del_host(deregister=self.deregister, + clear_node_reservations=False) except Exception as e: LOG.exception('Service error occurred when cleaning up ' 'the RPC manager. Error: %s', e) @@ -127,6 +128,21 @@ class RPCService(service.Service): LOG.info('Stopped RPC server for service %(service)s on host ' '%(host)s.', {'service': self.topic, 'host': self.host}) + + # Wait for reservation locks held by this conductor. + # The conductor process will end when: + # - All reservations for this conductor are released + # - CONF.graceful_shutdown_timeout has elapsed + # - The process manager (systemd, kubernetes) sends SIGKILL after the + # configured graceful period + graceful_time = initial_time + datetime.timedelta( + seconds=CONF.graceful_shutdown_timeout) + while (self.manager.has_reserved() + and graceful_time > timeutils.utcnow()): + LOG.info('Waiting for reserved nodes to clear on host %(host)s', + {'host': self.host}) + time.sleep(1) + rpc.set_global_manager(None) def _handle_signal(self, signo, frame): |