author    David Shrewsbury <shrewsbury.dave@gmail.com>  2014-07-17 09:41:00 -0400
committer David Shrewsbury <shrewsbury.dave@gmail.com>  2014-07-22 12:24:20 -0400
commit    428c24767575419d5b472b52f2b622588a3a7253 (patch)
tree      dc96fd0c51b87dd151a996590323be9711dd7266
parent    1326235151aea7858e52efda58eb01782e382110 (diff)
download  ironic-428c24767575419d5b472b52f2b622588a3a7253.tar.gz
Implement retry on NodeLocked exceptions
This adds the capability to retry node locking when a node is already
locked. Two conductor configuration variables are added to control this
behavior; retrying can be disabled entirely by setting
node_locked_retry_attempts to zero.

A new unit test verifies that the retry works: the first lock attempt
raises a NodeLocked exception, and the next attempt succeeds in getting
the lock.

Change-Id: I60b099beea1bc3a954a5ab4699e623aaa71ba6c5
Blueprint: add-nodelocked-retry
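A minimal standalone sketch of the retry pattern this change adopts (illustrative, not Ironic code): retry only on NodeLocked, give up after a fixed number of attempts, and sleep a fixed interval in between. Here the second attempt succeeds, mirroring the new unit test:

    import retrying


    class NodeLocked(Exception):
        """Stand-in for ironic.common.exception.NodeLocked."""


    calls = {'count': 0}


    # Same decorator shape as the task_manager.py hunk below: retry only
    # on NodeLocked, a fixed number of attempts, a fixed sleep in between
    # (retrying takes wait_fixed in milliseconds).
    @retrying.retry(
        retry_on_exception=lambda e: isinstance(e, NodeLocked),
        stop_max_attempt_number=3,
        wait_fixed=1 * 1000)
    def reserve_node():
        calls['count'] += 1
        if calls['count'] < 2:
            raise NodeLocked('node is reserved by another host')
        return 'reserved'


    print(reserve_node())  # attempt 1 raises NodeLocked; attempt 2 succeeds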
-rw-r--r--  etc/ironic/ironic.conf.sample                 6
-rw-r--r--  ironic/conductor/manager.py                   6
-rw-r--r--  ironic/conductor/task_manager.py             16
-rw-r--r--  ironic/tests/conductor/test_task_manager.py  24
-rw-r--r--  requirements.txt                              1
5 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample
index 53b378dab..b173c8743 100644
--- a/etc/ironic/ironic.conf.sample
+++ b/etc/ironic/ironic.conf.sample
@@ -517,6 +517,12 @@
# The size of the workers greenthread pool. (integer value)
#workers_pool_size=100
+# Number of attempts to grab a node lock. (integer value)
+#node_locked_retry_attempts=3
+
+# Seconds to sleep between node lock attempts. (integer value)
+#node_locked_retry_interval=1
+
[console]
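The new options live in the [conductor] section, so an operator wanting more patience than the defaults could uncomment and override them in ironic.conf; the values here are purely illustrative:

    [conductor]
    # Give contended nodes more chances before NodeLocked propagates.
    node_locked_retry_attempts = 5
    node_locked_retry_interval = 2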
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 3b0b6619b..d8162525d 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -114,6 +114,12 @@ conductor_opts = [
cfg.IntOpt('workers_pool_size',
default=100,
help='The size of the workers greenthread pool.'),
+ cfg.IntOpt('node_locked_retry_attempts',
+ default=3,
+ help='Number of attempts to grab a node lock.'),
+ cfg.IntOpt('node_locked_retry_interval',
+ default=1,
+ help='Seconds to sleep between node lock attempts.'),
]
CONF = cfg.CONF
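For context, registering IntOpts under the 'conductor' group makes them readable as attributes of the global config object. A self-contained oslo.config sketch using the same option definitions as the hunk above:

    from oslo.config import cfg

    opts = [
        cfg.IntOpt('node_locked_retry_attempts', default=3,
                   help='Number of attempts to grab a node lock.'),
        cfg.IntOpt('node_locked_retry_interval', default=1,
                   help='Seconds to sleep between node lock attempts.'),
    ]
    CONF = cfg.CONF
    CONF.register_opts(opts, group='conductor')

    print(CONF.conductor.node_locked_retry_attempts)  # 3 unless overridden
    print(CONF.conductor.node_locked_retry_interval)  # 1 unless overridden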
diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py
index a5fe434b7..db0b3fd62 100644
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@@ -85,6 +85,8 @@ code when the spawned task generates an exception:
"""
+import retrying
+
from oslo.config import cfg
from ironic.openstack.common import excutils
@@ -168,9 +170,21 @@ class TaskManager(object):
self.node = None
self.shared = shared
+ # NodeLocked exceptions can be annoying. Let's try to alleviate
+ # some of that pain by retrying our lock attempts. The retrying
+ # module expects a wait_fixed value in milliseconds.
+ @retrying.retry(
+ retry_on_exception=lambda e: isinstance(e, exception.NodeLocked),
+ stop_max_attempt_number=CONF.conductor.node_locked_retry_attempts,
+ wait_fixed=CONF.conductor.node_locked_retry_interval * 1000)
+ def reserve_node():
+ LOG.debug("Attempting to reserve node %(node)s",
+ {'node': node_id})
+ self.node = self._dbapi.reserve_node(CONF.host, node_id)
+
try:
if not self.shared:
- self.node = self._dbapi.reserve_node(CONF.host, node_id)
+ reserve_node()
else:
self.node = objects.Node.get(context, node_id)
self.ports = self._dbapi.get_ports_by_node_id(self.node.id)
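If every attempt raises NodeLocked, retrying (with its default wrap_exception=False) re-raises the original exception, so TaskManager callers still see NodeLocked, only after the configured number of attempts. A minimal sketch of that exhaustion path:

    import retrying


    class NodeLocked(Exception):
        pass


    @retrying.retry(
        retry_on_exception=lambda e: isinstance(e, NodeLocked),
        stop_max_attempt_number=3,
        wait_fixed=10)  # milliseconds; kept short for the demo
    def always_locked():
        raise NodeLocked('lock never released')


    try:
        always_locked()
    except NodeLocked:
        print('NodeLocked re-raised after the third failed attempt')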
diff --git a/ironic/tests/conductor/test_task_manager.py b/ironic/tests/conductor/test_task_manager.py
index 734863434..b0cef5f0b 100644
--- a/ironic/tests/conductor/test_task_manager.py
+++ b/ironic/tests/conductor/test_task_manager.py
@@ -115,9 +115,30 @@ class TaskManagerTestCase(tests_base.TestCase):
release_mock.call_args_list)
self.assertFalse(node_get_mock.called)
+ def test_excl_lock_exception_then_lock(self, get_ports_mock,
+ get_driver_mock, reserve_mock,
+ release_mock, node_get_mock):
+ retry_attempts = 3
+ self.config(node_locked_retry_attempts=retry_attempts,
+ group='conductor')
+
+ # Fail on the first lock attempt, succeed on the second.
+ reserve_mock.side_effect = [exception.NodeLocked(node='foo',
+ host='foo'),
+ self.node]
+
+ with task_manager.TaskManager(self.context, 'fake-node-id') as task:
+ self.assertFalse(task.shared)
+
+ reserve_mock.assert_called_with(self.host, 'fake-node-id')
+ self.assertEqual(2, reserve_mock.call_count)
+
def test_excl_lock_reserve_exception(self, get_ports_mock,
get_driver_mock, reserve_mock,
release_mock, node_get_mock):
+ retry_attempts = 3
+ self.config(node_locked_retry_attempts=retry_attempts,
+ group='conductor')
reserve_mock.side_effect = exception.NodeLocked(node='foo',
host='foo')
@@ -126,7 +147,8 @@ class TaskManagerTestCase(tests_base.TestCase):
self.context,
'fake-node-id')
- reserve_mock.assert_called_once_with(self.host, 'fake-node-id')
+ reserve_mock.assert_called_with(self.host, 'fake-node-id')
+ self.assertEqual(retry_attempts, reserve_mock.call_count)
self.assertFalse(get_ports_mock.called)
self.assertFalse(get_driver_mock.called)
self.assertFalse(release_mock.called)
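The new test leans on a mock behavior worth spelling out: a list passed as side_effect is consumed one item per call, raising exception items and returning the rest. A standalone sketch (assuming the mock library these tests use; unittest.mock offers the same API on Python 3):

    import mock


    class NodeLocked(Exception):
        pass


    # A list side_effect is consumed one item per call: an exception
    # instance is raised, anything else is returned.
    reserve_mock = mock.Mock(side_effect=[NodeLocked('held elsewhere'),
                                          'the-node'])

    try:
        reserve_mock('hostname', 'fake-node-id')   # first call raises
    except NodeLocked:
        pass

    print(reserve_mock('hostname', 'fake-node-id'))  # second call: 'the-node'
    print(reserve_mock.call_count)                   # 2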
diff --git a/requirements.txt b/requirements.txt
index 65aadd128..2beed54b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,3 +28,4 @@ jsonpatch>=1.1
WSME>=0.6
Jinja2
oslo.messaging>=1.3.0
+retrying>=1.2.2