summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/source/admin/compute-node-identification.rst83
-rw-r--r--doc/source/admin/index.rst1
-rw-r--r--doc/source/cli/nova-compute.rst2
-rw-r--r--nova/compute/manager.py8
-rw-r--r--nova/compute/resource_tracker.py8
-rw-r--r--nova/exception.py4
-rw-r--r--nova/objects/compute_node.py8
-rw-r--r--nova/tests/functional/test_service.py85
-rw-r--r--nova/tests/unit/compute/test_resource_tracker.py14
-rw-r--r--nova/tests/unit/objects/test_compute_node.py9
-rw-r--r--nova/virt/fake.py20
11 files changed, 240 insertions, 2 deletions
diff --git a/doc/source/admin/compute-node-identification.rst b/doc/source/admin/compute-node-identification.rst
new file mode 100644
index 0000000000..31d4802d0b
--- /dev/null
+++ b/doc/source/admin/compute-node-identification.rst
@@ -0,0 +1,83 @@
+===========================
+Compute Node Identification
+===========================
+
+Nova requires that compute nodes maintain a constant and consistent identity
+during their lifecycle. With the exception of the ironic driver, starting in
+the 2023.1 release, this is achieved by use of a file containing the node
+unique identifier that is persisted on disk. Prior to 2023.1, a combination of
+the compute node's hostname and the :oslo.config:option:`host` value in the
+configuration file were used.
+
+The 2023.1 and later compute node identification file must remain unchanged
+during the lifecycle of the compute node. Changing the value or removing the
+file will result in a failure to start and may require advanced techniques
+for recovery. The file is read once at `nova-compute`` startup, at which point
+it is validated for formatting and the corresponding node is located or
+created in the database.
+
+.. note::
+
+ Even after 2023.1, the compute node's hostname may not be changed after
+ the initial registration with the controller nodes, it is just not used
+ as the primary method for identification.
+
+The behavior of ``nova-compute`` is different when using the ironic driver,
+as the (UUID-based) identity and mapping of compute nodes to compute manager
+service hosts is dynamic. In that case, no single node identity is maintained
+by the compute host and thus no identity file is read or written. Thus none
+of the sections below apply to hosts with :oslo.config:option:`compute_driver`
+set to `ironic`.
+
+Self-provisioning of the node identity
+--------------------------------------
+
+By default, ``nova-compute`` will automatically generate and write a UUID to
+disk the first time it starts up, and will use that going forward as its
+stable identity. Using the :oslo.config:option:`state_path`
+(which is ``/var/lib/nova`` on most systems), a ``compute_id`` file will be
+created with a generated UUID.
+
+Since this file (and it's parent directory) is writable by nova, it may be
+desirable to move this to one of the other locations that nova looks for the
+identification file.
+
+Deployment provisioning of the node identity
+--------------------------------------------
+
+In addition to the location mentioned above, nova will also search the parent
+directories of any config file in use (either the defaults or provided on
+the command line) for a ``compute_id`` file. Thus, a deployment tool may, on
+most systems, pre-provision the node's UUID by writing one to
+``/etc/nova/compute_id``.
+
+The contents of the file should be a single UUID in canonical textual
+representation with no additional whitespace or other characters. The following
+should work on most Linux systems:
+
+.. code-block:: shell
+
+ $ uuidgen > /etc/nova/compute_id
+
+.. note::
+
+ **Do not** execute the above command blindly in every run of a deployment
+ tool, as that will result in overwriting the ``compute_id`` file each time,
+ which *will* prevent nova from working properly.
+
+Upgrading from pre-2023.1
+-------------------------
+
+Before release 2023.1, ``nova-compute`` only used the hostname (combined with
+:oslo.config:option:`host`, if set) to identify its compute node objects in
+the database. When upgrading from a prior release, the compute node will
+perform a one-time migration of the hostname-matched compute node UUID to the
+``compute_id`` file in the :oslo.config:option:`state_path` location.
+
+.. note::
+
+ It is imperative that you allow the above migration to run and complete on
+ compute nodes that are being upgraded. Skipping this step by
+ pre-provisioning a ``compute_id`` file before the upgrade will **not** work
+ and will be equivalent to changing the compute node UUID after it has
+ already been created once.
diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst
index 93b4e6a554..8cb5bf7156 100644
--- a/doc/source/admin/index.rst
+++ b/doc/source/admin/index.rst
@@ -206,6 +206,7 @@ instance for these kind of workloads.
secure-boot
sev
managing-resource-providers
+ compute-node-identification
resource-limits
cpu-models
libvirt-misc
diff --git a/doc/source/cli/nova-compute.rst b/doc/source/cli/nova-compute.rst
index f190949efa..1346dab92e 100644
--- a/doc/source/cli/nova-compute.rst
+++ b/doc/source/cli/nova-compute.rst
@@ -41,6 +41,8 @@ Files
* ``/etc/nova/policy.d/``
* ``/etc/nova/rootwrap.conf``
* ``/etc/nova/rootwrap.d/``
+* ``/etc/nova/compute_id``
+* ``/var/lib/nova/compute_id``
See Also
========
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c4537cd9a2..952ab3e199 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -10480,6 +10480,14 @@ class ComputeManager(manager.Manager):
LOG.exception(
"Error updating PCI resources for node %(node)s.",
{'node': nodename})
+ except exception.InvalidConfiguration as e:
+ if startup:
+ # If this happens during startup, we need to let it raise to
+ # abort our service startup.
+ raise
+ else:
+ LOG.error("Error updating resources for node %s: %s",
+ nodename, e)
except Exception:
LOG.exception("Error updating resources for node %(node)s.",
{'node': nodename})
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 70c56fd2e3..3f911f3708 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -728,7 +728,13 @@ class ResourceTracker(object):
cn = objects.ComputeNode(context)
cn.host = self.host
self._copy_resources(cn, resources, initial=True)
- cn.create()
+ try:
+ cn.create()
+ except exception.DuplicateRecord:
+ raise exception.InvalidConfiguration(
+ 'Duplicate compute node record found for host %s node %s' % (
+ cn.host, cn.hypervisor_hostname))
+
# Only map the ComputeNode into compute_nodes if create() was OK
# because if create() fails, on the next run through here nodename
# would be in compute_nodes and we won't try to create again (because
diff --git a/nova/exception.py b/nova/exception.py
index 20c112b628..f5993e79f8 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -2512,6 +2512,10 @@ class InvalidNodeConfiguration(NovaException):
msg_fmt = _('Invalid node identity configuration: %(reason)s')
+class DuplicateRecord(NovaException):
+ msg_fmt = _('Unable to create duplicate record for %(target)s')
+
+
class NotSupportedComputeForEvacuateV295(NotSupported):
msg_fmt = _("Starting to microversion 2.95, evacuate API will stop "
"instance on destination. To evacuate before upgrades are "
diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py
index 528cfc0776..dfc1b2ae28 100644
--- a/nova/objects/compute_node.py
+++ b/nova/objects/compute_node.py
@@ -12,6 +12,7 @@
# License for the specific language governing permissions and limitations
# under the License.
+from oslo_db import exception as db_exc
from oslo_serialization import jsonutils
from oslo_utils import uuidutils
from oslo_utils import versionutils
@@ -339,7 +340,12 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject):
self._convert_supported_instances_to_db_format(updates)
self._convert_pci_stats_to_db_format(updates)
- db_compute = db.compute_node_create(self._context, updates)
+ try:
+ db_compute = db.compute_node_create(self._context, updates)
+ except db_exc.DBDuplicateEntry:
+ target = 'compute node %s:%s' % (updates['hypervisor_hostname'],
+ updates['uuid'])
+ raise exception.DuplicateRecord(target=target)
self._from_db_object(self._context, self, db_compute)
@base.remotable
diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py
index 65b41594bd..21e9a519ee 100644
--- a/nova/tests/functional/test_service.py
+++ b/nova/tests/functional/test_service.py
@@ -10,8 +10,12 @@
# License for the specific language governing permissions and limitations
# under the License.
+import functools
from unittest import mock
+import fixtures
+from oslo_utils.fixture import uuidsentinel as uuids
+
from nova import context as nova_context
from nova import exception
from nova.objects import service
@@ -19,6 +23,7 @@ from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import fixtures as func_fixtures
from nova.tests.functional import integrated_helpers
+from nova.virt import node
class ServiceTestCase(test.TestCase,
@@ -137,3 +142,83 @@ class TestOldComputeCheck(
return_value=old_version):
self.assertRaises(
exception.TooOldComputeService, self._start_compute, 'host1')
+
+
+class TestComputeStartupChecks(test.TestCase):
+ STUB_COMPUTE_ID = False
+
+ def setUp(self):
+ super().setUp()
+ self.useFixture(nova_fixtures.RealPolicyFixture())
+ self.useFixture(nova_fixtures.NeutronFixture(self))
+ self.useFixture(nova_fixtures.GlanceFixture(self))
+ self.useFixture(func_fixtures.PlacementFixture())
+
+ self._local_uuid = str(uuids.node)
+
+ self.useFixture(fixtures.MockPatch(
+ 'nova.virt.node.get_local_node_uuid',
+ functools.partial(self.local_uuid, True)))
+ self.useFixture(fixtures.MockPatch(
+ 'nova.virt.node.read_local_node_uuid',
+ self.local_uuid))
+ self.useFixture(fixtures.MockPatch(
+ 'nova.virt.node.write_local_node_uuid',
+ mock.DEFAULT))
+ self.flags(compute_driver='fake.FakeDriverWithoutFakeNodes')
+
+ def local_uuid(self, get=False):
+ if get and not self._local_uuid:
+ # Simulate the get_local_node_uuid behavior of calling write once
+ self._local_uuid = str(uuids.node)
+ node.write_local_node_uuid(self._local_uuid)
+ return self._local_uuid
+
+ def test_compute_node_identity_greenfield(self):
+ # Level-set test case to show that starting and re-starting without
+ # any error cases works as expected.
+
+ # Start with no local compute_id
+ self._local_uuid = None
+ self.start_service('compute')
+
+ # Start should have generated and written a compute id
+ node.write_local_node_uuid.assert_called_once_with(str(uuids.node))
+
+ # Starting again should succeed and not cause another write
+ self.start_service('compute')
+ node.write_local_node_uuid.assert_called_once_with(str(uuids.node))
+
+ def test_compute_node_identity_deleted(self):
+ self.start_service('compute')
+
+ # Simulate the compute_id file being deleted
+ self._local_uuid = None
+
+ # Should refuse to start because it's not our first time and the file
+ # being missing is a hard error.
+ exc = self.assertRaises(exception.InvalidConfiguration,
+ self.start_service, 'compute')
+ self.assertIn('lost that state', str(exc))
+
+ def test_compute_node_hostname_changed(self):
+ # Start our compute once to create the node record
+ self.start_service('compute')
+
+ # Starting with a different hostname should trigger the abort
+ exc = self.assertRaises(exception.InvalidConfiguration,
+ self.start_service, 'compute', host='other')
+ self.assertIn('hypervisor_hostname', str(exc))
+
+ def test_compute_node_uuid_changed(self):
+ # Start our compute once to create the node record
+ self.start_service('compute')
+
+ # Simulate a changed local compute_id file
+ self._local_uuid = str(uuids.othernode)
+
+ # We should fail to create the compute node record again, but with a
+ # useful error message about why.
+ exc = self.assertRaises(exception.InvalidConfiguration,
+ self.start_service, 'compute')
+ self.assertIn('Duplicate compute node record', str(exc))
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index dfea323a9a..cd36b8987f 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -1552,6 +1552,20 @@ class TestInitComputeNode(BaseTestCase):
self.assertEqual('fake-host', node.host)
mock_update.assert_called()
+ @mock.patch.object(resource_tracker.ResourceTracker,
+ '_get_compute_node',
+ return_value=None)
+ @mock.patch('nova.objects.compute_node.ComputeNode.create')
+ def test_create_failed_conflict(self, mock_create, mock_getcn):
+ self._setup_rt()
+ resources = {'hypervisor_hostname': 'node1',
+ 'uuid': uuids.node1}
+ mock_create.side_effect = exc.DuplicateRecord(target='foo')
+ self.assertRaises(exc.InvalidConfiguration,
+ self.rt._init_compute_node,
+ mock.MagicMock,
+ resources)
+
@ddt.ddt
class TestUpdateComputeNode(BaseTestCase):
diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py
index 63b070c543..84c4e87785 100644
--- a/nova/tests/unit/objects/test_compute_node.py
+++ b/nova/tests/unit/objects/test_compute_node.py
@@ -16,6 +16,7 @@ import copy
from unittest import mock
import netaddr
+from oslo_db import exception as db_exc
from oslo_serialization import jsonutils
from oslo_utils.fixture import uuidsentinel
from oslo_utils import timeutils
@@ -341,6 +342,14 @@ class _TestComputeNodeObject(object):
'uuid': uuidsentinel.fake_compute_node}
mock_create.assert_called_once_with(self.context, param_dict)
+ @mock.patch('nova.db.main.api.compute_node_create')
+ def test_create_duplicate(self, mock_create):
+ mock_create.side_effect = db_exc.DBDuplicateEntry
+ compute = compute_node.ComputeNode(context=self.context)
+ compute.service_id = 456
+ compute.hypervisor_hostname = 'node1'
+ self.assertRaises(exception.DuplicateRecord, compute.create)
+
@mock.patch.object(db, 'compute_node_update')
@mock.patch(
'nova.db.main.api.compute_node_get', return_value=fake_compute_node)
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 2234bd068e..bf7dc8fc72 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -49,6 +49,7 @@ from nova.objects import migrate_data
from nova.virt import driver
from nova.virt import hardware
from nova.virt.ironic import driver as ironic
+import nova.virt.node
from nova.virt import virtapi
CONF = nova.conf.CONF
@@ -1130,3 +1131,22 @@ class EphEncryptionDriverPLAIN(MediumFakeDriver):
FakeDriver.capabilities,
supports_ephemeral_encryption=True,
supports_ephemeral_encryption_plain=True)
+
+
+class FakeDriverWithoutFakeNodes(FakeDriver):
+ """FakeDriver that behaves like a real single-node driver.
+
+ This behaves like a real virt driver from the perspective of its
+ nodes, with a stable nodename and use of the global node identity
+ stuff to provide a stable node UUID.
+ """
+
+ def get_available_resource(self, nodename):
+ resources = super().get_available_resource(nodename)
+ resources['uuid'] = nova.virt.node.get_local_node_uuid()
+ return resources
+
+ def get_nodenames_by_uuid(self, refresh=False):
+ return {
+ nova.virt.node.get_local_node_uuid(): self.get_available_nodes()[0]
+ }