diff options
author | David Shrewsbury <shrewsbury.dave@gmail.com> | 2014-07-17 09:41:00 -0400
committer | David Shrewsbury <shrewsbury.dave@gmail.com> | 2014-07-22 12:24:20 -0400
commit | 428c24767575419d5b472b52f2b622588a3a7253 (patch)
tree | dc96fd0c51b87dd151a996590323be9711dd7266
parent | 1326235151aea7858e52efda58eb01782e382110 (diff)
download | ironic-428c24767575419d5b472b52f2b622588a3a7253.tar.gz
Implement retry on NodeLocked exceptions
This adds the capability to retry node locking when a node is already
locked. Two conductor configuration variables are added to control this
behavior, which can be totally disabled by setting the value for the
node_locked_retry_attempts variable to zero.
A new unit test is added to check that the retry will work: after first
getting a NodeLocked exception, the lock is successfully acquired on the
next attempt.
Change-Id: I60b099beea1bc3a954a5ab4699e623aaa71ba6c5
Blueprint: add-nodelocked-retry
-rw-r--r-- | etc/ironic/ironic.conf.sample | 6
-rw-r--r-- | ironic/conductor/manager.py | 6
-rw-r--r-- | ironic/conductor/task_manager.py | 16
-rw-r--r-- | ironic/tests/conductor/test_task_manager.py | 24
-rw-r--r-- | requirements.txt | 1
5 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index 53b378dab..b173c8743 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -517,6 +517,12 @@ # The size of the workers greenthread pool. (integer value) #workers_pool_size=100 +# Number of attempts to grab a node lock. (integer value) +#node_locked_retry_attempts=3 + +# Seconds to sleep between node lock attempts. (integer value) +#node_locked_retry_interval=1 + [console] diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 3b0b6619b..d8162525d 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -114,6 +114,12 @@ conductor_opts = [ cfg.IntOpt('workers_pool_size', default=100, help='The size of the workers greenthread pool.'), + cfg.IntOpt('node_locked_retry_attempts', + default=3, + help='Number of attempts to grab a node lock.'), + cfg.IntOpt('node_locked_retry_interval', + default=1, + help='Seconds to sleep between node lock attempts.'), ] CONF = cfg.CONF diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py index a5fe434b7..db0b3fd62 100644 --- a/ironic/conductor/task_manager.py +++ b/ironic/conductor/task_manager.py @@ -85,6 +85,8 @@ code when the spawned task generates an exception: """ +import retrying + from oslo.config import cfg from ironic.openstack.common import excutils @@ -168,9 +170,21 @@ class TaskManager(object): self.node = None self.shared = shared + # NodeLocked exceptions can be annoying. Let's try to alleviate + # some of that pain by retrying our lock attempts. The retrying + # module expects a wait_fixed value in milliseconds. 
+ @retrying.retry( + retry_on_exception=lambda e: isinstance(e, exception.NodeLocked), + stop_max_attempt_number=CONF.conductor.node_locked_retry_attempts, + wait_fixed=CONF.conductor.node_locked_retry_interval * 1000) + def reserve_node(): + LOG.debug("Attempting to reserve node %(node)s", + {'node': node_id}) + self.node = self._dbapi.reserve_node(CONF.host, node_id) + try: if not self.shared: - self.node = self._dbapi.reserve_node(CONF.host, node_id) + reserve_node() else: self.node = objects.Node.get(context, node_id) self.ports = self._dbapi.get_ports_by_node_id(self.node.id) diff --git a/ironic/tests/conductor/test_task_manager.py b/ironic/tests/conductor/test_task_manager.py index 734863434..b0cef5f0b 100644 --- a/ironic/tests/conductor/test_task_manager.py +++ b/ironic/tests/conductor/test_task_manager.py @@ -115,9 +115,30 @@ class TaskManagerTestCase(tests_base.TestCase): release_mock.call_args_list) self.assertFalse(node_get_mock.called) + def test_excl_lock_exception_then_lock(self, get_ports_mock, + get_driver_mock, reserve_mock, + release_mock, node_get_mock): + retry_attempts = 3 + self.config(node_locked_retry_attempts=retry_attempts, + group='conductor') + + # Fail on the first lock attempt, succeed on the second. 
+ reserve_mock.side_effect = [exception.NodeLocked(node='foo', + host='foo'), + self.node] + + with task_manager.TaskManager(self.context, 'fake-node-id') as task: + self.assertFalse(task.shared) + + reserve_mock.assert_called(self.host, 'fake-node-id') + self.assertEqual(2, reserve_mock.call_count) + def test_excl_lock_reserve_exception(self, get_ports_mock, get_driver_mock, reserve_mock, release_mock, node_get_mock): + retry_attempts = 3 + self.config(node_locked_retry_attempts=retry_attempts, + group='conductor') reserve_mock.side_effect = exception.NodeLocked(node='foo', host='foo') @@ -126,7 +147,8 @@ class TaskManagerTestCase(tests_base.TestCase): self.context, 'fake-node-id') - reserve_mock.assert_called_once_with(self.host, 'fake-node-id') + reserve_mock.assert_called_with(self.host, 'fake-node-id') + self.assertEqual(retry_attempts, reserve_mock.call_count) self.assertFalse(get_ports_mock.called) self.assertFalse(get_driver_mock.called) self.assertFalse(release_mock.called) diff --git a/requirements.txt b/requirements.txt index 65aadd128..2beed54b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ jsonpatch>=1.1 WSME>=0.6 Jinja2 oslo.messaging>=1.3.0 +retrying>=1.2.2 |