author		Phil Day <philip.day@hp.com>	2013-11-23 22:51:12 +0000
committer	Alan Pevec <alan.pevec@redhat.com>	2014-09-18 22:09:45 +0200
commit		2afe82ea73069befb281508361f16462e71d6fc0 (patch)
tree		75dc9efb2f68ba8c34957b1addc964ac63d5c067
parent		e79de9d696b8becbc16aecf5ac276bcaeb2036aa (diff)
download	nova-2afe82ea73069befb281508361f16462e71d6fc0.tar.gz
Don't refresh network cache for instances building or deleting
Refreshing the network cache for an instance which is still building is both
unnecessary (the build itself will update the cache when it completes) and can
lead to a race condition if the periodic task gets incomplete network
information, then gets pre-empted by the build thread, and then updates the
cache after the build has finished. Likewise there is no value in updating the
network cache for an instance which is in the process of being deleted.

Closes-Bug: 1254320
(cherry picked from commit 713e538237c7fea0b93ade343b1d9368bdbf2698)

Conflicts:
	nova/compute/manager.py

Change-Id: I7dd1d0a005662b979eaea3476af2506cbc51f17a
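For context, a minimal standalone sketch of the selection logic this patch
introduces in the periodic heal task (the state names and dict-shaped
"instances" below are simplified assumptions for illustration, not the real
Nova objects): instances that are still building, are being deleted, or have
moved to another host are passed over when choosing a candidate for a
network info_cache refresh.

# Sketch only: illustrates the skip rules described in the commit message.
BUILDING = 'building'
DELETING = 'deleting'


def pick_instance_to_heal(instances, host):
    """Return the first instance whose network info_cache is safe to refresh.

    Building instances are skipped because the build thread will populate
    the cache itself (refreshing here could race with it and leave stale
    data); deleting or migrated-away instances are skipped because a
    refresh has no value for them.
    """
    for inst in instances:
        if inst.get('vm_state') == BUILDING:
            continue
        if inst.get('task_state') == DELETING:
            continue
        if inst.get('host') != host:
            continue
        return inst
    return None


if __name__ == '__main__':
    instances = [
        {'uuid': 'a', 'vm_state': BUILDING, 'task_state': None, 'host': 'h1'},
        {'uuid': 'b', 'vm_state': 'active', 'task_state': DELETING, 'host': 'h1'},
        {'uuid': 'c', 'vm_state': 'active', 'task_state': None, 'host': 'h2'},
        {'uuid': 'd', 'vm_state': 'active', 'task_state': None, 'host': 'h1'},
    ]
    # Only 'd' qualifies: 'a' is building, 'b' is deleting, 'c' has migrated.
    assert pick_instance_to_heal(instances, 'h1')['uuid'] == 'd'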
-rw-r--r--	nova/compute/manager.py	88
-rw-r--r--	nova/tests/compute/test_compute.py	48
2 files changed, 92 insertions(+), 44 deletions(-)
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 27d61a92c2..d54e39c025 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -4341,38 +4341,76 @@ class ComputeManager(manager.SchedulerDependentManager):
return
self._last_info_cache_heal = curr_time
- instance_uuids = getattr(self, '_instance_uuids_to_heal', None)
+ instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
instance = None
- while not instance or instance['host'] != self.host:
- if instance_uuids:
+ LOG.debug(_('Starting heal instance info cache'))
+
+ if not instance_uuids:
+ # The list of instances to heal is empty so rebuild it
+ LOG.debug(_('Rebuilding the list of instances to heal'))
+ db_instances = instance_obj.InstanceList.get_by_host(
+ context, self.host, expected_attrs=[])
+ for inst in db_instances:
+ # We don't want to refresh the cache for instances
+ # which are building or deleting so don't put them
+ # in the list. If they are building they will get
+ # added to the list next time we build it.
+ if (inst.vm_state == vm_states.BUILDING):
+ LOG.debug(_('Skipping network cache update for instance '
+ 'because it is Building.'), instance=inst)
+ continue
+ if (inst.task_state == task_states.DELETING):
+ LOG.debug(_('Skipping network cache update for instance '
+ 'because it is being deleted.'), instance=inst)
+ continue
+
+ if not instance:
+ # Save the first one we find so we don't
+ # have to get it again
+ instance = inst
+ else:
+ instance_uuids.append(inst['uuid'])
+
+ self._instance_uuids_to_heal = instance_uuids
+ else:
+ # Find the next valid instance on the list
+ while instance_uuids:
try:
- instance = instance_obj.Instance.get_by_uuid(
- context, instance_uuids.pop(0),
- expected_attrs=['system_metadata', 'info_cache'])
+ inst = instance_obj.Instance.get_by_uuid(
+ context, instance_uuids.pop(0),
+ expected_attrs=['system_metadata', 'info_cache'])
except exception.InstanceNotFound:
# Instance is gone. Try to grab another.
continue
- else:
- # No more in our copy of uuids. Pull from the DB.
- db_instances = instance_obj.InstanceList.get_by_host(
- context, self.host, expected_attrs=[])
- if not db_instances:
- # None.. just return.
- return
- instance = db_instances[0]
- instance_uuids = [inst['uuid'] for inst in db_instances[1:]]
- self._instance_uuids_to_heal = instance_uuids
- # We have an instance now and it's ours
- try:
- # Call to network API to get instance info.. this will
- # force an update to the instance's info_cache
- self._get_instance_nw_info(context, instance)
- LOG.debug(_('Updated the info_cache for instance'),
- instance=instance)
- except Exception as e:
- LOG.debug(_("An error occurred: %s"), e)
+ # Check the instance hasn't been migrated
+ if inst.host != self.host:
+ LOG.debug(_('Skipping network cache update for instance '
+ 'because it has been migrated to another '
+ 'host.'), instance=inst)
+ # Check the instance isn't being deleted
+ elif inst.task_state == task_states.DELETING:
+ LOG.debug(_('Skipping network cache update for instance '
+ 'because it is being deleted.'), instance=inst)
+ else:
+ instance = inst
+ break
+
+ if instance:
+ # We have an instance now to refresh
+ try:
+ # Call to network API to get instance info.. this will
+ # force an update to the instance's info_cache
+ self._get_instance_nw_info(context, instance)
+ LOG.debug(_('Updated the network info_cache for instance'),
+ instance=instance)
+ except Exception:
+ LOG.error(_('An error occurred while refreshing the network '
+ 'cache.'), instance=instance, exc_info=True)
+ else:
+ LOG.debug(_("Didn't find any instances for network info cache "
+ "update."))
@periodic_task.periodic_task
def _poll_rebooting_instances(self, context):
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 44fb0bb914..9fcf841720 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -5010,7 +5010,7 @@ class ComputeTestCase(BaseTestCase):
instance_map = {}
instances = []
- for x in xrange(5):
+ for x in xrange(8):
inst_uuid = 'fake-uuid-%s' % x
instance_map[inst_uuid] = fake_instance.fake_db_instance(
uuid=inst_uuid, host=CONF.host, created_at=None)
@@ -5050,40 +5050,50 @@ class ComputeTestCase(BaseTestCase):
self.stubs.Set(self.compute, '_get_instance_nw_info',
fake_get_instance_nw_info)
- call_info['expected_instance'] = instances[0]
+ # Make an instance appear to be still Building
+ instances[0]['vm_state'] = vm_states.BUILDING
+ # Make an instance appear to be Deleting
+ instances[1]['task_state'] = task_states.DELETING
+ # '0', '1' should be skipped..
+ call_info['expected_instance'] = instances[2]
self.compute._heal_instance_info_cache(ctxt)
self.assertEqual(1, call_info['get_all_by_host'])
self.assertEqual(0, call_info['get_by_uuid'])
self.assertEqual(1, call_info['get_nw_info'])
- call_info['expected_instance'] = instances[1]
+ call_info['expected_instance'] = instances[3]
self.compute._heal_instance_info_cache(ctxt)
self.assertEqual(1, call_info['get_all_by_host'])
self.assertEqual(1, call_info['get_by_uuid'])
self.assertEqual(2, call_info['get_nw_info'])
# Make an instance switch hosts
- instances[2]['host'] = 'not-me'
+ instances[4]['host'] = 'not-me'
# Make an instance disappear
- instance_map.pop(instances[3]['uuid'])
- # '2' and '3' should be skipped..
- call_info['expected_instance'] = instances[4]
+ instance_map.pop(instances[5]['uuid'])
+ # Make an instance switch to be Deleting
+ instances[6]['task_state'] = task_states.DELETING
+ # '4', '5', and '6' should be skipped..
+ call_info['expected_instance'] = instances[7]
self.compute._heal_instance_info_cache(ctxt)
- self.assertEqual(call_info['get_all_by_host'], 1)
- # Incremented for '2' and '4'.. '3' caused a raise above.
- self.assertEqual(call_info['get_by_uuid'], 3)
- self.assertEqual(call_info['get_nw_info'], 3)
+ self.assertEqual(1, call_info['get_all_by_host'])
+ self.assertEqual(4, call_info['get_by_uuid'])
+ self.assertEqual(3, call_info['get_nw_info'])
# Should be no more left.
- self.assertEqual(len(self.compute._instance_uuids_to_heal), 0)
+ self.assertEqual(0, len(self.compute._instance_uuids_to_heal))
+
+ # This should cause a DB query now, so get a list of instances
+ # where none can be processed to make sure we handle that case
+ # cleanly. Use just '0' (Building) and '1' (Deleting)
+ instances = instances[0:2]
- # This should cause a DB query now so we get first instance
- # back again
- call_info['expected_instance'] = instances[0]
self.compute._heal_instance_info_cache(ctxt)
- self.assertEqual(call_info['get_all_by_host'], 2)
- # Stays the same, because the instance came from the DB
- self.assertEqual(call_info['get_by_uuid'], 3)
- self.assertEqual(call_info['get_nw_info'], 4)
+ # Should have called the list once more
+ self.assertEqual(2, call_info['get_all_by_host'])
+ # Stays the same because we remove invalid entries from the list
+ self.assertEqual(4, call_info['get_by_uuid'])
+ # Stays the same because we didn't find anything to process
+ self.assertEqual(3, call_info['get_nw_info'])
def test_poll_rescued_instances(self):
timed_out_time = timeutils.utcnow() - datetime.timedelta(minutes=5)