author     Zuul <zuul@review.opendev.org>             2021-03-05 12:38:02 +0000
committer  Gerrit Code Review <review@openstack.org>  2021-03-05 12:38:02 +0000
commit     e65beeb0c42f9e212f6b7dd993e1b66dcd044f04 (patch)
tree       99d1ccbaffaa0b3a77fc6ba5d9664cbeb0adc8b6
parent     d1ffc6a557c6b3f3641bee2cfc9ec3be4649bdbe (diff)
parent     25a05cf352b5125fdfa3b702e8318a12160f1993 (diff)
Merge "Always retry locking when performing task handoff" into stable/victoria
-rw-r--r--  ironic/conductor/manager.py                                                        4
-rw-r--r--  ironic/conductor/task_manager.py                                                  13
-rw-r--r--  ironic/tests/unit/conductor/test_manager.py                                       43
-rw-r--r--  ironic/tests/unit/conductor/test_task_manager.py                                  19
-rw-r--r--  releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml   7
5 files changed, 82 insertions, 4 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 1852d261a..748754a90 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -861,7 +861,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         """
         LOG.debug("RPC continue_node_deploy called for node %s.", node_id)
 
-        with task_manager.acquire(context, node_id, shared=False,
+        with task_manager.acquire(context, node_id, shared=False, patient=True,
                                   purpose='continue node deploying') as task:
             node = task.node
 
@@ -1151,7 +1151,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         """
         LOG.debug("RPC continue_node_clean called for node %s.", node_id)
 
-        with task_manager.acquire(context, node_id, shared=False,
+        with task_manager.acquire(context, node_id, shared=False, patient=True,
                                   purpose='continue node cleaning') as task:
             node = task.node
             if node.target_provision_state == states.MANAGEABLE:
diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py
index 7ed3eab3c..8a337ee33 100644
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@@ -170,7 +170,7 @@ class TaskManager(object):
     """
 
     def __init__(self, context, node_id, shared=False,
-                 purpose='unspecified action', retry=True,
+                 purpose='unspecified action', retry=True, patient=False,
                  load_driver=True):
         """Create a new TaskManager.
 
@@ -185,6 +185,12 @@ class TaskManager(object):
                        lock. Default: False.
         :param purpose: human-readable purpose to put to debug logs.
         :param retry: whether to retry locking if it fails. Default: True.
+        :param patient: whether to indefinitely retry locking if it fails.
+                        Set this to True if there is an operation that does not
+                        have any fallback or built-in retry mechanism, such as
+                        finalizing a state transition during deploy/clean.
+                        The default retry behavior is to retry a configured
+                        number of times and then give up. Default: False.
         :param load_driver: whether to load the ``driver`` object. Set this to
                             False if loading the driver is undesired or
                             impossible.
@@ -203,6 +209,7 @@ class TaskManager(object):
         self.node_id = node_id
         self.shared = shared
         self._retry = retry
+        self._patient = patient
 
         self.fsm = states.machine.copy()
         self._purpose = purpose
@@ -260,7 +267,9 @@ class TaskManager(object):
     def _lock(self):
         self._debug_timer.restart()
 
-        if self._retry:
+        if self._patient:
+            attempts = None
+        elif self._retry:
             attempts = CONF.conductor.node_locked_retry_attempts
         else:
             attempts = 1
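
The whole behavioral change in ``_lock()`` is how ``attempts`` is chosen: None when patient, the configured count for the default retry behavior, or a single try. Below is a minimal illustrative sketch of that selection logic, not ironic's actual ``_lock()`` implementation (which also waits ``CONF.conductor.node_locked_retry_interval`` between tries); the ``reserve`` callable and the local ``NodeLocked`` class are hypothetical stand-ins.

# Illustrative sketch only, not code from this change. ``reserve`` is a
# hypothetical callable that raises NodeLocked while another conductor
# holds the node's reservation.
import itertools
import time


class NodeLocked(Exception):
    """Stand-in for ironic.common.exception.NodeLocked."""


def lock_with_retries(reserve, attempts=None, interval=1.0):
    # attempts=None (the patient=True case) gives an endless counter, so
    # contention can delay the caller but never make it give up; a bounded
    # range models retry=True (configured count) and retry=False (one try).
    tries = itertools.count() if attempts is None else range(attempts)
    for _ in tries:
        try:
            return reserve()
        except NodeLocked:
            time.sleep(interval)
    raise NodeLocked()

With ``attempts=None`` the loop can never exhaust its tries, which is exactly the property the continue_node_deploy/continue_node_clean handoffs need: nothing upstream re-sends those RPCs if the lock is never acquired.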
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 5a24e9683..d68470277 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -1821,6 +1821,27 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
                                       deployments.do_next_deploy_step,
                                       mock.ANY, 1)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_continue_node_deploy_locked(self, mock_spawn):
+        """Test that continuing a deploy via RPC cannot fail due to locks."""
+        max_attempts = 3
+        self.config(node_locked_retry_attempts=max_attempts, group='conductor')
+        prv_state = states.DEPLOYWAIT
+        tgt_prv_state = states.ACTIVE
+        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
+                                          provision_state=prv_state,
+                                          target_provision_state=tgt_prv_state,
+                                          last_error=None,
+                                          deploy_step=self.deploy_steps[0])
+        self._start_service()
+        with mock.patch.object(objects.Node, 'reserve', autospec=True) as mck:
+            mck.side_effect = (
+                ([exception.NodeLocked(node='foo', host='foo')] * max_attempts)
+                + [node])
+            self.service.continue_node_deploy(self.context, node.uuid)
+        self._stop_service()
+
     @mock.patch.object(task_manager.TaskManager, 'process_event',
                        autospec=True)
     @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
@@ -2768,6 +2789,28 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
     def test_continue_node_clean_manual_abort_last_clean_step(self):
         self._continue_node_clean_abort_last_clean_step(manual=True)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_continue_node_clean_locked(self, mock_spawn):
+        """Test that continuing a clean via RPC cannot fail due to locks."""
+        max_attempts = 3
+        self.config(node_locked_retry_attempts=max_attempts, group='conductor')
+        driver_info = {'clean_steps': [self.clean_steps[0]],
+                       'clean_step_index': 0}
+        tgt_prov_state = states.AVAILABLE
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.CLEANWAIT,
+            target_provision_state=tgt_prov_state, last_error=None,
+            driver_internal_info=driver_info, clean_step=self.clean_steps[0])
+        self._start_service()
+        with mock.patch.object(objects.Node, 'reserve', autospec=True) as mck:
+            mck.side_effect = (
+                ([exception.NodeLocked(node='foo', host='foo')] * max_attempts)
+                + [node])
+            self.service.continue_node_clean(self.context, node.uuid)
+        self._stop_service()
+
 
 class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
                            db_base.DbTestCase):
diff --git a/ironic/tests/unit/conductor/test_task_manager.py b/ironic/tests/unit/conductor/test_task_manager.py
index e49982d59..c20d053e3 100644
--- a/ironic/tests/unit/conductor/test_task_manager.py
+++ b/ironic/tests/unit/conductor/test_task_manager.py
@@ -222,6 +222,25 @@ class TaskManagerTestCase(db_base.DbTestCase):
         reserve_mock.assert_called_once_with(self.context, self.host,
                                              'fake-node-id')
 
+    def test_excl_lock_exception_patient(
+            self, get_voltgt_mock, get_volconn_mock, get_portgroups_mock,
+            get_ports_mock, build_driver_mock,
+            reserve_mock, release_mock, node_get_mock):
+        retry_attempts = 3
+        self.config(node_locked_retry_attempts=retry_attempts,
+                    group='conductor')
+
+        # Fail on the first 3 attempts, succeed on the fourth.
+        reserve_mock.side_effect = (
+            ([exception.NodeLocked(node='foo', host='foo')] * 3) + [self.node])
+
+        task_manager.TaskManager(self.context, 'fake-node-id', patient=True)
+
+        expected_calls = [mock.call(self.context, self.host,
+                                    'fake-node-id')] * 4
+        reserve_mock.assert_has_calls(expected_calls)
+        self.assertEqual(4, reserve_mock.call_count)
+
     def test_excl_lock_reserve_exception(
             self, get_voltgt_mock, get_volconn_mock, get_portgroups_mock,
             get_ports_mock, build_driver_mock,
diff --git a/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml b/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml
new file mode 100644
index 000000000..a8f1abd5e
--- /dev/null
+++ b/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes a bug where a conductor could fail to complete a deployment if there
+    was contention on a shared lock. This would manifest as an instance being
+    stuck in the "deploying" state, though the node had in fact started or even
+    completed its final boot.
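
For context, the fixed behavior reduces to the acquire pattern below, a minimal sketch mirroring the manager.py hunks above; the ``context`` and ``node_id`` variables are assumed to be supplied by the RPC layer.

# Usage sketch of the new flag at a task-handoff call site. With
# patient=True, NodeLocked contention is retried indefinitely instead of
# failing after CONF.conductor.node_locked_retry_attempts tries.
from ironic.conductor import task_manager

with task_manager.acquire(context, node_id, shared=False, patient=True,
                          purpose='continue node deploying') as task:
    node = task.node  # finalize the deploy step while holding the lock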