diff options
-rw-r--r-- | doc/source/admin/compute-node-identification.rst | 83 | ||||
-rw-r--r-- | doc/source/admin/index.rst | 1 | ||||
-rw-r--r-- | doc/source/cli/nova-compute.rst | 2 | ||||
-rw-r--r-- | nova/compute/manager.py | 8 | ||||
-rw-r--r-- | nova/compute/resource_tracker.py | 8 | ||||
-rw-r--r-- | nova/exception.py | 4 | ||||
-rw-r--r-- | nova/objects/compute_node.py | 8 | ||||
-rw-r--r-- | nova/tests/functional/test_service.py | 85 | ||||
-rw-r--r-- | nova/tests/unit/compute/test_resource_tracker.py | 14 | ||||
-rw-r--r-- | nova/tests/unit/objects/test_compute_node.py | 9 | ||||
-rw-r--r-- | nova/virt/fake.py | 20 |
11 files changed, 240 insertions, 2 deletions
diff --git a/doc/source/admin/compute-node-identification.rst b/doc/source/admin/compute-node-identification.rst new file mode 100644 index 0000000000..31d4802d0b --- /dev/null +++ b/doc/source/admin/compute-node-identification.rst @@ -0,0 +1,83 @@ +=========================== +Compute Node Identification +=========================== + +Nova requires that compute nodes maintain a constant and consistent identity +during their lifecycle. With the exception of the ironic driver, starting in +the 2023.1 release, this is achieved by use of a file containing the node +unique identifier that is persisted on disk. Prior to 2023.1, a combination of +the compute node's hostname and the :oslo.config:option:`host` value in the +configuration file was used. + +The 2023.1 and later compute node identification file must remain unchanged +during the lifecycle of the compute node. Changing the value or removing the +file will result in a failure to start and may require advanced techniques +for recovery. The file is read once at ``nova-compute`` startup, at which point +it is validated for formatting and the corresponding node is located or +created in the database. + +.. note:: + + Even after 2023.1, the compute node's hostname may not be changed after + the initial registration with the controller nodes; it is just not used + as the primary method for identification. + +The behavior of ``nova-compute`` is different when using the ironic driver, +as the (UUID-based) identity and mapping of compute nodes to compute manager +service hosts is dynamic. In that case, no single node identity is maintained +by the compute host and thus no identity file is read or written. Thus none +of the sections below apply to hosts with :oslo.config:option:`compute_driver` +set to ``ironic``. 
+ +Self-provisioning of the node identity +-------------------------------------- + +By default, ``nova-compute`` will automatically generate and write a UUID to +disk the first time it starts up, and will use that going forward as its +stable identity. Using the :oslo.config:option:`state_path` +(which is ``/var/lib/nova`` on most systems), a ``compute_id`` file will be +created with a generated UUID. + +Since this file (and its parent directory) is writable by nova, it may be +desirable to move this to one of the other locations that nova looks for the +identification file. + +Deployment provisioning of the node identity +-------------------------------------------- + +In addition to the location mentioned above, nova will also search the parent +directories of any config file in use (either the defaults or provided on +the command line) for a ``compute_id`` file. Thus, a deployment tool may, on +most systems, pre-provision the node's UUID by writing one to +``/etc/nova/compute_id``. + +The contents of the file should be a single UUID in canonical textual +representation with no additional whitespace or other characters. The following +should work on most Linux systems: + +.. code-block:: shell + + $ uuidgen > /etc/nova/compute_id + +.. note:: + + **Do not** execute the above command blindly in every run of a deployment + tool, as that will result in overwriting the ``compute_id`` file each time, + which *will* prevent nova from working properly. + +Upgrading from pre-2023.1 +------------------------- + +Before release 2023.1, ``nova-compute`` only used the hostname (combined with +:oslo.config:option:`host`, if set) to identify its compute node objects in +the database. When upgrading from a prior release, the compute node will +perform a one-time migration of the hostname-matched compute node UUID to the +``compute_id`` file in the :oslo.config:option:`state_path` location. + +.. 
note:: + + It is imperative that you allow the above migration to run and complete on + compute nodes that are being upgraded. Skipping this step by + pre-provisioning a ``compute_id`` file before the upgrade will **not** work + and will be equivalent to changing the compute node UUID after it has + already been created once. diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst index 93b4e6a554..8cb5bf7156 100644 --- a/doc/source/admin/index.rst +++ b/doc/source/admin/index.rst @@ -206,6 +206,7 @@ instance for these kind of workloads. secure-boot sev managing-resource-providers + compute-node-identification resource-limits cpu-models libvirt-misc diff --git a/doc/source/cli/nova-compute.rst b/doc/source/cli/nova-compute.rst index f190949efa..1346dab92e 100644 --- a/doc/source/cli/nova-compute.rst +++ b/doc/source/cli/nova-compute.rst @@ -41,6 +41,8 @@ Files * ``/etc/nova/policy.d/`` * ``/etc/nova/rootwrap.conf`` * ``/etc/nova/rootwrap.d/`` +* ``/etc/nova/compute_id`` +* ``/var/lib/nova/compute_id`` See Also ======== diff --git a/nova/compute/manager.py b/nova/compute/manager.py index c4537cd9a2..952ab3e199 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -10480,6 +10480,14 @@ class ComputeManager(manager.Manager): LOG.exception( "Error updating PCI resources for node %(node)s.", {'node': nodename}) + except exception.InvalidConfiguration as e: + if startup: + # If this happens during startup, we need to let it raise to + # abort our service startup. 
+ raise + else: + LOG.error("Error updating resources for node %s: %s", + nodename, e) except Exception: LOG.exception("Error updating resources for node %(node)s.", {'node': nodename}) diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 70c56fd2e3..3f911f3708 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -728,7 +728,13 @@ class ResourceTracker(object): cn = objects.ComputeNode(context) cn.host = self.host self._copy_resources(cn, resources, initial=True) - cn.create() + try: + cn.create() + except exception.DuplicateRecord: + raise exception.InvalidConfiguration( + 'Duplicate compute node record found for host %s node %s' % ( + cn.host, cn.hypervisor_hostname)) + # Only map the ComputeNode into compute_nodes if create() was OK # because if create() fails, on the next run through here nodename # would be in compute_nodes and we won't try to create again (because diff --git a/nova/exception.py b/nova/exception.py index 20c112b628..f5993e79f8 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2512,6 +2512,10 @@ class InvalidNodeConfiguration(NovaException): msg_fmt = _('Invalid node identity configuration: %(reason)s') +class DuplicateRecord(NovaException): + msg_fmt = _('Unable to create duplicate record for %(target)s') + + class NotSupportedComputeForEvacuateV295(NotSupported): msg_fmt = _("Starting to microversion 2.95, evacuate API will stop " "instance on destination. To evacuate before upgrades are " diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py index 528cfc0776..dfc1b2ae28 100644 --- a/nova/objects/compute_node.py +++ b/nova/objects/compute_node.py @@ -12,6 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. 
+from oslo_db import exception as db_exc from oslo_serialization import jsonutils from oslo_utils import uuidutils from oslo_utils import versionutils @@ -339,7 +340,12 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject): self._convert_supported_instances_to_db_format(updates) self._convert_pci_stats_to_db_format(updates) - db_compute = db.compute_node_create(self._context, updates) + try: + db_compute = db.compute_node_create(self._context, updates) + except db_exc.DBDuplicateEntry: + target = 'compute node %s:%s' % (updates['hypervisor_hostname'], + updates['uuid']) + raise exception.DuplicateRecord(target=target) self._from_db_object(self._context, self, db_compute) @base.remotable diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py index 65b41594bd..21e9a519ee 100644 --- a/nova/tests/functional/test_service.py +++ b/nova/tests/functional/test_service.py @@ -10,8 +10,12 @@ # License for the specific language governing permissions and limitations # under the License. 
+import functools from unittest import mock +import fixtures +from oslo_utils.fixture import uuidsentinel as uuids + from nova import context as nova_context from nova import exception from nova.objects import service @@ -19,6 +23,7 @@ from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional import fixtures as func_fixtures from nova.tests.functional import integrated_helpers +from nova.virt import node class ServiceTestCase(test.TestCase, @@ -137,3 +142,83 @@ class TestOldComputeCheck( return_value=old_version): self.assertRaises( exception.TooOldComputeService, self._start_compute, 'host1') + + +class TestComputeStartupChecks(test.TestCase): + STUB_COMPUTE_ID = False + + def setUp(self): + super().setUp() + self.useFixture(nova_fixtures.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.GlanceFixture(self)) + self.useFixture(func_fixtures.PlacementFixture()) + + self._local_uuid = str(uuids.node) + + self.useFixture(fixtures.MockPatch( + 'nova.virt.node.get_local_node_uuid', + functools.partial(self.local_uuid, True))) + self.useFixture(fixtures.MockPatch( + 'nova.virt.node.read_local_node_uuid', + self.local_uuid)) + self.useFixture(fixtures.MockPatch( + 'nova.virt.node.write_local_node_uuid', + mock.DEFAULT)) + self.flags(compute_driver='fake.FakeDriverWithoutFakeNodes') + + def local_uuid(self, get=False): + if get and not self._local_uuid: + # Simulate the get_local_node_uuid behavior of calling write once + self._local_uuid = str(uuids.node) + node.write_local_node_uuid(self._local_uuid) + return self._local_uuid + + def test_compute_node_identity_greenfield(self): + # Level-set test case to show that starting and re-starting without + # any error cases works as expected. 
+ + # Start with no local compute_id + self._local_uuid = None + self.start_service('compute') + + # Start should have generated and written a compute id + node.write_local_node_uuid.assert_called_once_with(str(uuids.node)) + + # Starting again should succeed and not cause another write + self.start_service('compute') + node.write_local_node_uuid.assert_called_once_with(str(uuids.node)) + + def test_compute_node_identity_deleted(self): + self.start_service('compute') + + # Simulate the compute_id file being deleted + self._local_uuid = None + + # Should refuse to start because it's not our first time and the file + # being missing is a hard error. + exc = self.assertRaises(exception.InvalidConfiguration, + self.start_service, 'compute') + self.assertIn('lost that state', str(exc)) + + def test_compute_node_hostname_changed(self): + # Start our compute once to create the node record + self.start_service('compute') + + # Starting with a different hostname should trigger the abort + exc = self.assertRaises(exception.InvalidConfiguration, + self.start_service, 'compute', host='other') + self.assertIn('hypervisor_hostname', str(exc)) + + def test_compute_node_uuid_changed(self): + # Start our compute once to create the node record + self.start_service('compute') + + # Simulate a changed local compute_id file + self._local_uuid = str(uuids.othernode) + + # We should fail to create the compute node record again, but with a + # useful error message about why. 
+ exc = self.assertRaises(exception.InvalidConfiguration, + self.start_service, 'compute') + self.assertIn('Duplicate compute node record', str(exc)) diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index dfea323a9a..cd36b8987f 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -1552,6 +1552,20 @@ class TestInitComputeNode(BaseTestCase): self.assertEqual('fake-host', node.host) mock_update.assert_called() + @mock.patch.object(resource_tracker.ResourceTracker, + '_get_compute_node', + return_value=None) + @mock.patch('nova.objects.compute_node.ComputeNode.create') + def test_create_failed_conflict(self, mock_create, mock_getcn): + self._setup_rt() + resources = {'hypervisor_hostname': 'node1', + 'uuid': uuids.node1} + mock_create.side_effect = exc.DuplicateRecord(target='foo') + self.assertRaises(exc.InvalidConfiguration, + self.rt._init_compute_node, + mock.MagicMock, + resources) + @ddt.ddt class TestUpdateComputeNode(BaseTestCase): diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py index 63b070c543..84c4e87785 100644 --- a/nova/tests/unit/objects/test_compute_node.py +++ b/nova/tests/unit/objects/test_compute_node.py @@ -16,6 +16,7 @@ import copy from unittest import mock import netaddr +from oslo_db import exception as db_exc from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel from oslo_utils import timeutils @@ -341,6 +342,14 @@ class _TestComputeNodeObject(object): 'uuid': uuidsentinel.fake_compute_node} mock_create.assert_called_once_with(self.context, param_dict) + @mock.patch('nova.db.main.api.compute_node_create') + def test_create_duplicate(self, mock_create): + mock_create.side_effect = db_exc.DBDuplicateEntry + compute = compute_node.ComputeNode(context=self.context) + compute.service_id = 456 + compute.hypervisor_hostname = 'node1' + 
self.assertRaises(exception.DuplicateRecord, compute.create) + @mock.patch.object(db, 'compute_node_update') @mock.patch( 'nova.db.main.api.compute_node_get', return_value=fake_compute_node) diff --git a/nova/virt/fake.py b/nova/virt/fake.py index 2234bd068e..bf7dc8fc72 100644 --- a/nova/virt/fake.py +++ b/nova/virt/fake.py @@ -49,6 +49,7 @@ from nova.objects import migrate_data from nova.virt import driver from nova.virt import hardware from nova.virt.ironic import driver as ironic +import nova.virt.node from nova.virt import virtapi CONF = nova.conf.CONF @@ -1130,3 +1131,22 @@ class EphEncryptionDriverPLAIN(MediumFakeDriver): FakeDriver.capabilities, supports_ephemeral_encryption=True, supports_ephemeral_encryption_plain=True) + + +class FakeDriverWithoutFakeNodes(FakeDriver): + """FakeDriver that behaves like a real single-node driver. + + This behaves like a real virt driver from the perspective of its + nodes, with a stable nodename and use of the global node identity + stuff to provide a stable node UUID. + """ + + def get_available_resource(self, nodename): + resources = super().get_available_resource(nodename) + resources['uuid'] = nova.virt.node.get_local_node_uuid() + return resources + + def get_nodenames_by_uuid(self, refresh=False): + return { + nova.virt.node.get_local_node_uuid(): self.get_available_nodes()[0] + } |