summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2022-06-23 11:04:20 -0700
committerRuby Loo <opensrloo@gmail.com>2022-12-19 20:28:53 +0000
commit7048c4f10a9f9fa648f6c729178d6f22a264e8f0 (patch)
tree54e300d57fe4d63b81fe5d51d08cdd560b11e0fb
parentdfab332c17fe3a4399bd0ae254c6e31a0ff5147c (diff)
downloadironic-7048c4f10a9f9fa648f6c729178d6f22a264e8f0.tar.gz
Prevent pxe retry when agent token exists
A race condition can be observed in CI under heavy load where the conductor triggers a reboot of the agent before it is fully online, based upon state but without considering the existence of an agent token. As a result, the agent is never able to check in with Ironic and the overall operation fails. We now consider the agent token's existence before retrying PXE, as it is the very earliest indicator of a starting agent. Change-Id: Ice764866a08647031d16570860ec384204269501 Story: 2010107 Task: 45674 (cherry picked from commit d75424b5e5685a9cf04b30a5b0555efd1313e9c3)
-rw-r--r--ironic/drivers/modules/pxe_base.py6
-rw-r--r--ironic/tests/unit/drivers/modules/test_pxe.py12
-rw-r--r--releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml7
3 files changed, 25 insertions, 0 deletions
diff --git a/ironic/drivers/modules/pxe_base.py b/ironic/drivers/modules/pxe_base.py
index 7434e179b..8da196dae 100644
--- a/ironic/drivers/modules/pxe_base.py
+++ b/ironic/drivers/modules/pxe_base.py
@@ -505,6 +505,12 @@ class PXEBaseMixin(object):
def _should_retry_boot(node):
# NOTE(dtantsur): this assumes IPA, do we need to make it generic?
for field in ('agent_last_heartbeat', 'last_power_state_change'):
+ if node.driver_internal_info.get('agent_secret_token', False):
+ LOG.debug('Not retrying PXE boot for node %(node)s; an agent '
+ 'token has been identified, meaning the agent '
+ 'has started.',
+ {'node': node.uuid})
+ return False
if manager_utils.value_within_timeout(
node.driver_internal_info.get(field),
CONF.pxe.boot_retry_timeout):
diff --git a/ironic/tests/unit/drivers/modules/test_pxe.py b/ironic/tests/unit/drivers/modules/test_pxe.py
index 81a72bc00..d3b203498 100644
--- a/ironic/tests/unit/drivers/modules/test_pxe.py
+++ b/ironic/tests/unit/drivers/modules/test_pxe.py
@@ -1407,6 +1407,18 @@ class PXEBootRetryTestCase(db_base.DbTestCase):
mock_boot_dev.assert_called_once_with(task, 'pxe',
persistent=False)
+ def test_check_boot_status_not_retry_with_token(self, mock_power,
+ mock_boot_dev):
+ with task_manager.acquire(self.context, self.node.uuid,
+ shared=True) as task:
+ task.node.driver_internal_info = {
+ 'agent_secret_token': 'xyz'
+ }
+ task.driver.boot._check_boot_status(task)
+ self.assertTrue(task.shared)
+ mock_power.assert_not_called()
+ mock_boot_dev.assert_not_called()
+
class iPXEBootRetryTestCase(PXEBootRetryTestCase):
diff --git a/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml b/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml
new file mode 100644
index 000000000..5db6db6ec
--- /dev/null
+++ b/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+ - |
+ Fixes a race condition in PXE initialization where the logic that retries
+ PXE boot operations suspected to have failed did not check whether an
+ ``agent token`` had been established; establishing the token is the
+ very first step in agent initialization.