summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.opendev.org>2020-07-23 04:48:55 +0000
committerGerrit Code Review <review@openstack.org>2020-07-23 04:48:55 +0000
commit8418e651c3dcbea1d0b1f8561d079d690d549622 (patch)
tree4d3078eadc61dc051dbbae459f58c3150e4627dc
parent0e26dfe83902fdc84890f81d9ca06b72fae35ce5 (diff)
parent46f8c857524e8a45935836ac97463928a2d9a0a9 (diff)
downloadironic-8418e651c3dcbea1d0b1f8561d079d690d549622.tar.gz
Merge "Add agent power interface"
-rw-r--r--ironic/conductor/task_manager.py13
-rw-r--r--ironic/conductor/utils.py19
-rw-r--r--ironic/drivers/generic.py3
-rw-r--r--ironic/drivers/modules/agent_client.py17
-rw-r--r--ironic/drivers/modules/agent_power.py220
-rw-r--r--ironic/tests/unit/drivers/modules/test_agent_power.py127
-rw-r--r--releasenotes/notes/agent-power-a000fdf37cb870e4.yaml6
-rw-r--r--setup.cfg1
8 files changed, 402 insertions, 4 deletions
diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py
index df68c27aa..b4089d618 100644
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@@ -336,6 +336,19 @@ class TaskManager(object):
self._on_error_args = args
self._on_error_kwargs = kwargs
+ def downgrade_lock(self):
+ """Downgrade the lock to a shared one."""
+ if self.node is None:
+ raise RuntimeError("Cannot downgrade an already released lock")
+
+ if not self.shared:
+ objects.Node.release(self.context, CONF.host, self.node.id)
+ self.shared = True
+ self.node.refresh()
+ LOG.debug("Successfully downgraded lock for %(purpose)s "
+ "on node %(node)s",
+ {'purpose': self._purpose, 'node': self.node.uuid})
+
def release_resources(self):
"""Unlock a node and release resources.
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index b4d7bf076..c64dc9f5a 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -963,6 +963,21 @@ def value_within_timeout(value, timeout):
return last_valid <= last
+def agent_is_alive(node, timeout=None):
+ """Check that the agent is likely alive.
+
+ The method then checks for the last agent heartbeat, and if it occured
+ within the timeout set by [deploy]fast_track_timeout, then agent is
+ presumed alive.
+
+ :param node: A node object.
+ :param timeout: Heartbeat timeout, defaults to `fast_track_timeout`.
+ """
+ return value_within_timeout(
+ node.driver_internal_info.get('agent_last_heartbeat'),
+ timeout or CONF.deploy.fast_track_timeout)
+
+
def is_fast_track(task):
"""Checks a fast track is available.
@@ -987,9 +1002,7 @@ def is_fast_track(task):
{'node': task.node.uuid, 'error': task.node.last_error})
return False
- if value_within_timeout(
- task.node.driver_internal_info.get('agent_last_heartbeat'),
- CONF.deploy.fast_track_timeout):
+ if agent_is_alive(task.node):
return True
else:
LOG.debug('Node %(node)s should be fast-track-able, but the agent '
diff --git a/ironic/drivers/generic.py b/ironic/drivers/generic.py
index 1e7a83c4b..599e1139c 100644
--- a/ironic/drivers/generic.py
+++ b/ironic/drivers/generic.py
@@ -18,6 +18,7 @@ Generic hardware types.
from ironic.drivers import hardware_type
from ironic.drivers.modules import agent
+from ironic.drivers.modules import agent_power
from ironic.drivers.modules.ansible import deploy as ansible_deploy
from ironic.drivers.modules import fake
from ironic.drivers.modules import inspector
@@ -102,7 +103,7 @@ class ManualManagementHardware(GenericHardware):
@property
def supported_power_interfaces(self):
"""List of supported power interfaces."""
- return [fake.FakePower]
+ return [agent_power.AgentPower, fake.FakePower]
@property
def supported_vendor_interfaces(self):
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index e4395ed25..d1f3d6759 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -30,6 +30,8 @@ METRICS = metrics_utils.get_metrics_logger(__name__)
DEFAULT_IPA_PORTAL_PORT = 3260
+REBOOT_COMMAND = 'run_image'
+
def get_command_error(command):
"""Extract an error string from the command result.
@@ -564,6 +566,21 @@ class AgentClient(object):
method='standby.power_off',
params={})
+ @METRICS.timer('AgentClient.reboot')
+ def reboot(self, node):
+ """Soft reboots the bare metal node by shutting down ramdisk OS.
+
+ :param node: A Node object.
+ :raises: IronicException when failed to issue the request or there was
+ a malformed response from the agent.
+ :raises: AgentAPIError when agent failed to execute specified command.
+ :returns: A dict containing command response from agent.
+ See :func:`get_commands_status` for a command result sample.
+ """
+ return self._command(node=node,
+ method='standby.%s' % REBOOT_COMMAND,
+ params={})
+
@METRICS.timer('AgentClient.sync')
def sync(self, node):
"""Flush file system buffers forcing changed blocks to disk.
diff --git a/ironic/drivers/modules/agent_power.py b/ironic/drivers/modules/agent_power.py
new file mode 100644
index 000000000..11ef5711a
--- /dev/null
+++ b/ironic/drivers/modules/agent_power.py
@@ -0,0 +1,220 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+The agent power interface.
+"""
+
+import time
+
+from oslo_config import cfg
+from oslo_log import log
+import retrying
+
+from ironic.common import exception
+from ironic.common.i18n import _
+from ironic.common import states
+from ironic.conductor import utils as cond_utils
+from ironic.drivers import base
+from ironic.drivers.modules import agent_client
+
+
+CONF = cfg.CONF
+
+LOG = log.getLogger(__name__)
+
+_POWER_WAIT = 30
+
+
+class AgentPower(base.PowerInterface):
+ """Power interface using the running agent for power actions."""
+
+ def __init__(self):
+ super(AgentPower, self).__init__()
+ if not CONF.deploy.fast_track:
+ raise exception.InvalidParameterValue(
+ _('[deploy]fast_track must be True to enable the agent '
+ 'power interface'))
+ self._client = agent_client.AgentClient()
+
+ def get_properties(self):
+ """Return the properties of the interface.
+
+ :returns: dictionary of <property name>:<property description> entries.
+ """
+ return {}
+
+ def validate(self, task):
+ """Validate the driver-specific Node deployment info.
+
+ :param task: A TaskManager instance containing the node to act on.
+ :raises: InvalidParameterValue on malformed parameter(s)
+ """
+ # NOTE(dtantsur): the fast_track option is mutable, so we have to check
+ # it again on validation.
+ if not CONF.deploy.fast_track:
+ raise exception.InvalidParameterValue(
+ _('[deploy]fast_track must be True to enable the agent '
+ 'power interface'))
+ # TODO(dtantsur): support ACTIVE nodes
+ if not cond_utils.agent_is_alive(task.node):
+ raise exception.InvalidParameterValue(
+ _('Agent seems offline for node %s, the agent power interface '
+ 'cannot be used') % task.node.uuid)
+
+ def supports_power_sync(self, task):
+ """Check if power sync is supported for the given node.
+
+ Not supported for the agent power since it is not possible to power
+ on/off nodes.
+
+ :param task: A TaskManager instance containing the node to act on
+ with a **shared** lock.
+ :returns: boolean, whether power sync is supported.
+ """
+ return False
+
+ def get_supported_power_states(self, task):
+ """Get a list of the supported power states.
+
+ Only contains REBOOT.
+
+ :param task: A TaskManager instance containing the node to act on.
+ :returns: A list with the supported power states defined
+ in :mod:`ironic.common.states`.
+ """
+ return [states.REBOOT, states.SOFT_REBOOT]
+
+ def get_power_state(self, task):
+ """Return the power state of the task's node.
+
+ Essentially, the only known state is POWER ON, everything else is
+ an error (or more precisely ``None``).
+
+ :param task: A TaskManager instance containing the node to act on.
+ :returns: A power state. One of :mod:`ironic.common.states`.
+ """
+ # TODO(dtantsur): support ACTIVE nodes
+ if cond_utils.agent_is_alive(task.node):
+ return states.POWER_ON
+ else:
+ LOG.error('Node %s is not fast-track-able, cannot determine '
+ 'its power state via the "agent" power interface',
+ task.node.uuid)
+ return None
+
+ def set_power_state(self, task, power_state, timeout=None):
+ """Set the power state of the task's node.
+
+ :param task: A TaskManager instance containing the node to act on.
+ :param power_state: Power state from :mod:`ironic.common.states`.
+ Only REBOOT and SOFT_REBOOT are supported and are synonymous.
+ :param timeout: timeout (in seconds) positive integer (> 0) for any
+ power state. ``None`` indicates to use default timeout.
+ :raises: PowerStateFailure on non-supported power state.
+ """
+ if power_state in (states.REBOOT, states.SOFT_REBOOT):
+ return self.reboot(task)
+ else:
+ LOG.error('Power state %(state)s is not implemented for node '
+ '%(node)s using the "agent" power interface',
+ {'node': task.node.uuid, 'state': power_state})
+ raise exception.PowerStateFailure(pstate=power_state)
+
+ def reboot(self, task, timeout=None):
+ """Perform a reboot of the task's node.
+
+ Only soft reboot is implemented.
+
+ :param task: A TaskManager instance containing the node to act on.
+ :param timeout: timeout (in seconds) positive integer (> 0) for any
+ power state. ``None`` indicates to use default timeout.
+ """
+ node = task.node
+
+ self._client.reboot(node)
+
+ info = node.driver_internal_info
+ # NOTE(dtantsur): wipe the agent token, otherwise the rebooted agent
+ # won't be able to heartbeat. This is mostly a precaution since the
+ # calling code in conductor is expected to handle it.
+ if not info.get('agent_secret_token_pregenerated'):
+ info.pop('agent_secret_token', None)
+ # NOTE(dtantsur): the URL may change on reboot, wipe it as well (but
+ # only after we call reboot).
+ info.pop('agent_url', None)
+ node.driver_internal_info = info
+ node.save()
+
+ LOG.debug('Requested reboot of node %(node)s via the agent, waiting '
+ '%(wait)d seconds for the node to power down',
+ {'node': task.node.uuid, 'wait': _POWER_WAIT})
+ time.sleep(_POWER_WAIT)
+
+ if (node.provision_state in (states.DEPLOYING, states.CLEANING)
+ and (node.driver_internal_info.get('deployment_reboot')
+ or node.driver_internal_info.get('cleaning_reboot'))):
+ # NOTE(dtantsur): we need to downgrade the lock otherwise
+ # heartbeats won't be processed. It should not have side effects
+ # for nodes in DEPLOYING/CLEANING.
+ task.downgrade_lock()
+
+ try:
+ self._wait_for_reboot(task, timeout)
+ finally:
+ # The caller probably expects a lock, so re-acquire it
+ task.upgrade_lock()
+
+ def _wait_for_reboot(self, task, timeout):
+ wait = CONF.agent.post_deploy_get_power_state_retry_interval
+ if not timeout:
+ timeout = CONF.agent.post_deploy_get_power_state_retries * wait
+
+ @retrying.retry(
+ stop_max_delay=timeout,
+ retry_on_result=lambda result: not result,
+ retry_on_exception=(
+ lambda e: isinstance(e, exception.AgentConnectionFailed)),
+ wait_fixed=wait * 1000
+ )
+ def _wait_until_rebooted(task):
+ try:
+ status = self._client.get_commands_status(
+ task.node, retry_connection=False, expect_errors=True)
+ except exception.AgentConnectionFailed:
+ LOG.debug('Still waiting for the agent to come back on the '
+ 'node %s', task.node.uuid)
+ raise
+
+ if any(cmd['command_name'] == agent_client.REBOOT_COMMAND
+ for cmd in status):
+ LOG.debug('Still waiting for the agent to power off on the '
+ 'node %s', task.node.uuid)
+ return False
+
+ return True
+
+ try:
+ _wait_until_rebooted(task)
+ except exception.AgentConnectionFailed as exc:
+ msg = _('Agent failed to come back on %(node)s with the "agent" '
+ 'power interface: %(exc)s') % {
+ 'node': task.node.uuid, 'exc': exc}
+ LOG.error(msg)
+ raise exception.PowerStateFailure(msg)
+ except Exception as exc:
+ LOG.error('Could not reboot node %(node)s with the "agent" power '
+ 'interface: %(exc)s',
+ {'node': task.node.uuid, 'exc': exc})
+ raise exception.PowerStateFailure(
+ _('Unexpected error when rebooting through the agent: %s')
+ % exc)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_power.py b/ironic/tests/unit/drivers/modules/test_agent_power.py
new file mode 100644
index 000000000..0d4004c66
--- /dev/null
+++ b/ironic/tests/unit/drivers/modules/test_agent_power.py
@@ -0,0 +1,127 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import datetime
+from unittest import mock
+
+from ironic.common import exception
+from ironic.common import states
+from ironic.conductor import task_manager
+from ironic.drivers.modules import agent_client
+from ironic.drivers.modules import agent_power
+from ironic.tests.unit.db import base as db_base
+from ironic.tests.unit.objects import utils as object_utils
+
+
+@mock.patch('time.sleep', lambda _sec: None)
+class AgentPowerTest(db_base.DbTestCase):
+
+ def setUp(self):
+ super(AgentPowerTest, self).setUp()
+ self.config(fast_track=True, group='deploy')
+ self.power = agent_power.AgentPower()
+ dii = {
+ 'agent_last_heartbeat': datetime.datetime.now().strftime(
+ "%Y-%m-%dT%H:%M:%S.%f"),
+ 'deployment_reboot': True,
+ 'agent_url': 'http://url',
+ 'agent_secret_token': 'very secret',
+ }
+ self.node = object_utils.create_test_node(
+ self.context, driver_internal_info=dii,
+ provision_state=states.DEPLOYING)
+ self.task = mock.Mock(spec=task_manager.TaskManager, node=self.node)
+
+ def test_basics(self):
+ self.assertEqual({}, self.power.get_properties())
+ self.assertFalse(self.power.supports_power_sync(self.task))
+ self.assertEqual([states.REBOOT, states.SOFT_REBOOT],
+ self.power.get_supported_power_states(self.task))
+
+ def test_validate(self):
+ self.power.validate(self.task)
+
+ def test_validate_fails(self):
+ self.node.driver_internal_info['agent_last_heartbeat'] = \
+ datetime.datetime(2010, 7, 19).strftime(
+ "%Y-%m-%dT%H:%M:%S.%f")
+ self.assertRaises(exception.InvalidParameterValue,
+ self.power.validate, self.task)
+
+ del self.node.driver_internal_info['agent_last_heartbeat']
+ self.assertRaises(exception.InvalidParameterValue,
+ self.power.validate, self.task)
+
+ def test_get_power_state(self):
+ self.assertEqual(states.POWER_ON,
+ self.power.get_power_state(self.task))
+
+ def test_get_power_state_unknown(self):
+ self.node.driver_internal_info['agent_last_heartbeat'] = \
+ datetime.datetime(2010, 7, 19).strftime(
+ "%Y-%m-%dT%H:%M:%S.%f")
+ self.assertIsNone(self.power.get_power_state(self.task))
+
+ del self.node.driver_internal_info['agent_last_heartbeat']
+ self.assertIsNone(self.power.get_power_state(self.task))
+
+ @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+ autospec=True)
+ @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
+ def test_reboot(self, mock_reboot, mock_commands):
+ mock_commands.side_effect = [
+ [{'command_name': 'run_image', 'command_status': 'RUNNING'}],
+ exception.AgentConnectionFailed,
+ exception.AgentConnectionFailed,
+ [{'command_name': 'get_deploy_steps', 'command_status': 'RUNNING'}]
+ ]
+ with task_manager.acquire(self.context, self.node.id) as task:
+ # Save the node since the upgrade_lock call changes it
+ node = task.node
+ self.power.reboot(task)
+ mock_reboot.assert_called_once_with(self.power._client, node)
+ mock_commands.assert_called_with(self.power._client, node,
+ retry_connection=False,
+ expect_errors=True)
+ self.assertEqual(4, mock_commands.call_count)
+
+ node.refresh()
+ self.assertNotIn('agent_secret_token', node.driver_internal_info)
+ self.assertNotIn('agent_url', node.driver_internal_info)
+
+ @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+ autospec=True)
+ @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
+ def test_reboot_timeout(self, mock_reboot, mock_commands):
+ mock_commands.side_effect = exception.AgentConnectionFailed
+ with task_manager.acquire(self.context, self.node.id) as task:
+ node = task.node
+ self.assertRaisesRegex(exception.PowerStateFailure,
+ 'Agent failed to come back',
+ self.power.reboot, task, timeout=0.001)
+ mock_commands.assert_called_with(self.power._client, node,
+ retry_connection=False,
+ expect_errors=True)
+
+ @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
+ def test_reboot_another_state(self, mock_reboot):
+ with task_manager.acquire(self.context, self.node.id) as task:
+ task.node.provision_state = states.DEPLOYWAIT
+ self.power.reboot(task)
+ mock_reboot.assert_called_once_with(self.power._client, task.node)
+
+ @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
+ def test_reboot_into_instance(self, mock_reboot):
+ with task_manager.acquire(self.context, self.node.id) as task:
+ del task.node.driver_internal_info['deployment_reboot']
+ self.power.reboot(task)
+ mock_reboot.assert_called_once_with(self.power._client, task.node)
diff --git a/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml b/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml
new file mode 100644
index 000000000..549a78b21
--- /dev/null
+++ b/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml
@@ -0,0 +1,6 @@
+---
+features:
+ - |
+ The new **experimental** ``agent`` power interface allows limited
+ provisioning operations on nodes without BMC credentials. See `story
+ 2007771 <https://storyboard.openstack.org/#!/story/2007771>`_ for details.
diff --git a/setup.cfg b/setup.cfg
index 1101c09b5..c314e21a5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -121,6 +121,7 @@ ironic.hardware.interfaces.network =
noop = ironic.drivers.modules.network.noop:NoopNetwork
ironic.hardware.interfaces.power =
+ agent = ironic.drivers.modules.agent_power:AgentPower
fake = ironic.drivers.modules.fake:FakePower
ibmc = ironic.drivers.modules.ibmc.power:IBMCPower
idrac = ironic.drivers.modules.drac.power:DracPower