diff options
author | Lingxian Kong <anlin.kong@gmail.com> | 2021-02-10 09:53:06 +1300 |
---|---|---|
committer | Lingxian Kong <anlin.kong@gmail.com> | 2021-02-11 10:57:45 +1300 |
commit | 2b6b7ab4b7810667426be26ff03a6c367bb2460d (patch) | |
tree | 7f6b96065c913bb622a9ae67d524526e0eacdcc3 | |
parent | 73978a9b978788817ee98ef6970a35f9121a2dc9 (diff) | |
download | trove-2b6b7ab4b7810667426be26ff03a6c367bb2460d.tar.gz |
Do not override RESTART_REQUIRED service status
The RESTART_REQUIRED service status set by guest agent should not
be overridden by the health heartbeat mechanism.
The RESTART_REQUIRED can only be changed when the instance is restarted
or rebooted.
Story: 2008612
Task: 41795
Change-Id: I98baf252452353237bc8fb14357df4e7bcb2867a
(Cherry picked from 9c2e0bf3a0f1bc0b35a148174d8a4d2083f2b3c5)
-rw-r--r-- | trove/conductor/manager.py | 13 | ||||
-rw-r--r-- | trove/guestagent/guest_log.py | 27 | ||||
-rw-r--r-- | trove/instance/models.py | 62 | ||||
-rw-r--r-- | trove/instance/service_status.py | 1 | ||||
-rw-r--r-- | trove/tests/unittests/instance/test_service.py | 129 | ||||
-rw-r--r-- | trove/tests/unittests/trove_testtools.py | 34 | ||||
-rw-r--r-- | trove/tests/unittests/util/util.py | 8 |
7 files changed, 230 insertions, 44 deletions
diff --git a/trove/conductor/manager.py b/trove/conductor/manager.py index 22fb75c5..d5738e19 100644 --- a/trove/conductor/manager.py +++ b/trove/conductor/manager.py @@ -24,7 +24,7 @@ from trove.common.serializable_notification import SerializableNotification from trove.conductor.models import LastSeen from trove.extensions.mysql import models as mysql_models from trove.instance import models as inst_models -from trove.instance.service_status import ServiceStatus +from trove.instance import service_status as svc_status LOG = logging.getLogger(__name__) CONF = cfg.CONF @@ -86,11 +86,20 @@ class Manager(periodic_task.PeriodicTasks): "payload": str(payload)}) status = inst_models.InstanceServiceStatus.find_by( instance_id=instance_id) + if self._message_too_old(instance_id, 'heartbeat', sent): return + + if status.get_status() == svc_status.ServiceStatuses.RESTART_REQUIRED: + LOG.debug("Instance %s service status is RESTART_REQUIRED, " + "skip heartbeat", instance_id) + return + if payload.get('service_status') is not None: status.set_status( - ServiceStatus.from_description(payload['service_status'])) + svc_status.ServiceStatus.from_description( + payload['service_status']) + ) status.save() def update_backup(self, context, instance_id, backup_id, diff --git a/trove/guestagent/guest_log.py b/trove/guestagent/guest_log.py index 1b7c85d6..570a08ed 100644 --- a/trove/guestagent/guest_log.py +++ b/trove/guestagent/guest_log.py @@ -15,6 +15,7 @@ import enum import hashlib import os +from pathlib import Path from requests.exceptions import ConnectionError from oslo_log import log as logging @@ -245,34 +246,36 @@ class GuestLog(object): 'published': self._published_size}) def _update_details(self): - # Make sure we can read the file - if not self._file_readable or not os.access(self._file, os.R_OK): + if operating_system.exists(self._file, as_root=True): + file_path = Path(self._file) + + # Make sure guest agent can read the log file. if not os.access(self._file, os.R_OK): - if operating_system.exists(self._file, as_root=True): - operating_system.chmod( - self._file, FileMode.ADD_ALL_R, as_root=True) - self._file_readable = True - - if os.path.isfile(self._file): - logstat = os.stat(self._file) - self._size = logstat.st_size + operating_system.chmod(self._file, FileMode.ADD_ALL_R, + as_root=True) + operating_system.chmod(str(file_path.parent), + FileMode.ADD_GRP_RX_OTH_RX, + as_root=True) + + self._size = file_path.stat().st_size self._update_log_header_digest(self._file) if self.status != LogStatus.Disabled: if self._log_rotated(): self.status = LogStatus.Rotated # See if we have stuff to publish - elif logstat.st_size > self._published_size: + elif self._size > self._published_size: self._set_status(self._published_size, LogStatus.Partial, LogStatus.Ready) # We've published everything so far - elif logstat.st_size == self._published_size: + elif self._size == self._published_size: self._set_status(self._published_size, LogStatus.Published, LogStatus.Enabled) # We've already handled this case (log rotated) so what gives? else: raise Exception(_("Bug in _log_rotated ?")) else: + LOG.warning(f"File {self._file} does not exist") self._published_size = 0 self._size = 0 diff --git a/trove/instance/models.py b/trove/instance/models.py index c33ea1c6..70bd6fd6 100644 --- a/trove/instance/models.py +++ b/trove/instance/models.py @@ -602,24 +602,30 @@ def load_instance(cls, context, id, needs_server=False, return cls(context, db_info, server, service_status) -def load_instance_with_info(cls, context, id, cluster_id=None): - db_info = get_db_info(context, id, cluster_id) - LOG.debug('Task status for instance %s: %s', id, db_info.task_status) - - service_status = InstanceServiceStatus.find_by(instance_id=id) - if (db_info.task_status == InstanceTasks.NONE and - not service_status.is_uptodate()): - LOG.warning('Guest agent heartbeat for instance %s has expried', id) +def update_service_status(task_status, service_status, ins_id): + """Update service status as needed.""" + if (task_status == InstanceTasks.NONE and + service_status.status != srvstatus.ServiceStatuses.RESTART_REQUIRED and + not service_status.is_uptodate()): + LOG.warning('Guest agent heartbeat for instance %s has expried', + ins_id) service_status.status = \ srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT + +def load_instance_with_info(cls, context, ins_id, cluster_id=None): + db_info = get_db_info(context, ins_id, cluster_id) + service_status = InstanceServiceStatus.find_by(instance_id=ins_id) + + update_service_status(db_info.task_status, service_status, ins_id) + load_simple_instance_server_status(context, db_info) load_simple_instance_addresses(context, db_info) instance = cls(context, db_info, service_status) - load_guest_info(instance, context, id) + load_guest_info(instance, context, ins_id) load_server_group_info(instance, context) @@ -897,6 +903,11 @@ class BaseInstance(SimpleInstance): del_instance.set_status(srvstatus.ServiceStatuses.DELETED) del_instance.save() + def set_servicestatus_restart(self): + del_instance = InstanceServiceStatus.find_by(instance_id=self.id) + del_instance.set_status(srvstatus.ServiceStatuses.RESTARTING) + del_instance.save() + def set_instance_fault_deleted(self): try: del_fault = DBInstanceFault.find_by(instance_id=self.id) @@ -1432,7 +1443,10 @@ class Instance(BuiltInstance): LOG.info("Rebooting instance %s.", self.id) if self.db_info.cluster_id is not None and not self.context.is_admin: raise exception.ClusterInstanceOperationNotSupported() + self.update_db(task_status=InstanceTasks.REBOOTING) + self.set_servicestatus_restart() + task_api.API(self.context).reboot(self.id) def restart(self): @@ -1440,13 +1454,10 @@ class Instance(BuiltInstance): LOG.info("Restarting datastore on instance %s.", self.id) if self.db_info.cluster_id is not None and not self.context.is_admin: raise exception.ClusterInstanceOperationNotSupported() - # Set our local status since Nova might not change it quick enough. - # TODO(tim.simpson): Possible bad stuff can happen if this service - # shuts down before it can set status to NONE. - # We need a last updated time to mitigate this; - # after some period of tolerance, we'll assume the - # status is no longer in effect. + self.update_db(task_status=InstanceTasks.REBOOTING) + self.set_servicestatus_restart() + task_api.API(self.context).restart(self.id) def detach_replica(self): @@ -1819,9 +1830,9 @@ class Instances(object): db.server_status = "SHUTDOWN" # Fake it... db.addresses = [] - datastore_status = InstanceServiceStatus.find_by( + service_status = InstanceServiceStatus.find_by( instance_id=db.id) - if not datastore_status.status: # This should never happen. + if not service_status.status: # This should never happen. LOG.error("Server status could not be read for " "instance id(%s).", db.id) continue @@ -1829,24 +1840,15 @@ class Instances(object): # Get the real-time service status. LOG.debug('Task status for instance %s: %s', db.id, db.task_status) - if db.task_status == InstanceTasks.NONE: - last_heartbeat_delta = ( - timeutils.utcnow() - datastore_status.updated_at) - agent_expiry_interval = timedelta( - seconds=CONF.agent_heartbeat_expiry) - if last_heartbeat_delta > agent_expiry_interval: - LOG.warning( - 'Guest agent heartbeat for instance %s has ' - 'expried', id) - datastore_status.status = \ - srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT + update_service_status(db.task_status, service_status, db.id) except exception.ModelNotFoundError: LOG.error("Server status could not be read for " "instance id(%s).", db.id) continue - ret.append(load_instance(context, db, datastore_status, - server=server)) + ret.append( + load_instance(context, db, service_status, server=server) + ) return ret diff --git a/trove/instance/service_status.py b/trove/instance/service_status.py index 913ecb5a..6c1c3fa9 100644 --- a/trove/instance/service_status.py +++ b/trove/instance/service_status.py @@ -36,6 +36,7 @@ class ServiceStatus(object): ServiceStatuses.CRASHED._code, ServiceStatuses.BLOCKED._code, ServiceStatuses.HEALTHY._code, + ServiceStatuses.RESTART_REQUIRED._code, ] return self._code in allowed_statuses diff --git a/trove/tests/unittests/instance/test_service.py b/trove/tests/unittests/instance/test_service.py new file mode 100644 index 00000000..aa0af8de --- /dev/null +++ b/trove/tests/unittests/instance/test_service.py @@ -0,0 +1,129 @@ +# Copyright 2020 Catalyst Cloud +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from datetime import timedelta +from unittest import mock + +from trove.common import cfg +from trove.common import timeutils +from trove.datastore import models as ds_models +from trove.instance import models as ins_models +from trove.instance import service +from trove.instance import service_status as srvstatus +from trove.tests.unittests import trove_testtools +from trove.tests.unittests.util import util + +CONF = cfg.CONF + + +class TestInstanceController(trove_testtools.TestCase): + @classmethod + def setUpClass(cls): + util.init_db() + + cls.ds_name = cls.random_name('datastore', + prefix='TestInstanceController') + ds_models.update_datastore(name=cls.ds_name, default_version=None) + cls.ds = ds_models.Datastore.load(cls.ds_name) + + ds_models.update_datastore_version( + cls.ds_name, 'test_image_id', 'mysql', cls.random_uuid(), '', 1) + + cls.ds_version_imageid = ds_models.DatastoreVersion.load( + cls.ds, 'test_image_id') + + cls.controller = service.InstanceController() + + super(TestInstanceController, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + util.cleanup_db() + + super(TestInstanceController, cls).tearDownClass() + + def setUp(self): + trove_testtools.patch_notifier(self) + super(TestInstanceController, self).setUp() + + @mock.patch('trove.instance.models.load_server_group_info') + @mock.patch('trove.instance.models.load_guest_info') + @mock.patch('trove.instance.models.load_simple_instance_addresses') + @mock.patch('trove.instance.models.load_simple_instance_server_status') + def test_show_with_restart_required(self, load_server_mock, + load_addr_mock, load_guest_mock, + load_server_grp_mock): + # Create an instance in db. + instance = ins_models.DBInstance.create( + name=self.random_name('instance'), + flavor_id=self.random_uuid(), + tenant_id=self.random_uuid(), + volume_size=1, + datastore_version_id=self.ds_version_imageid.id, + task_status=ins_models.InstanceTasks.NONE, + compute_instance_id=self.random_uuid(), + server_status='ACTIVE' + ) + ins_models.InstanceServiceStatus.create( + instance_id=instance.id, + status=srvstatus.ServiceStatuses.RESTART_REQUIRED, + ) + + # workaround to reset updated_at field. + service_status = ins_models.InstanceServiceStatus.find_by( + instance_id=instance.id) + service_status.updated_at = timeutils.utcnow() - timedelta( + seconds=(CONF.agent_heartbeat_expiry + 60)) + ins_models.get_db_api().save(service_status) + + ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id) + self.assertEqual(200, ret.status) + + ret_instance = ret.data(None)['instance'] + self.assertEqual('RESTART_REQUIRED', ret_instance.get('status')) + + @mock.patch('trove.instance.models.load_server_group_info') + @mock.patch('trove.instance.models.load_guest_info') + @mock.patch('trove.instance.models.load_simple_instance_addresses') + @mock.patch('trove.instance.models.load_simple_instance_server_status') + def test_show_without_restart_required(self, load_server_mock, + load_addr_mock, load_guest_mock, + load_server_grp_mock): + # Create an instance in db. + instance = ins_models.DBInstance.create( + name=self.random_name('instance'), + flavor_id=self.random_uuid(), + tenant_id=self.random_uuid(), + volume_size=1, + datastore_version_id=self.ds_version_imageid.id, + task_status=ins_models.InstanceTasks.NONE, + compute_instance_id=self.random_uuid(), + server_status='ACTIVE' + ) + ins_models.InstanceServiceStatus.create( + instance_id=instance.id, + status=srvstatus.ServiceStatuses.HEALTHY, + ) + + # workaround to reset updated_at field. + service_status = ins_models.InstanceServiceStatus.find_by( + instance_id=instance.id) + service_status.updated_at = timeutils.utcnow() - timedelta( + seconds=(CONF.agent_heartbeat_expiry + 60)) + ins_models.get_db_api().save(service_status) + + ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id) + self.assertEqual(200, ret.status) + + ret_instance = ret.data(None)['instance'] + self.assertEqual('ERROR', ret_instance.get('status')) diff --git a/trove/tests/unittests/trove_testtools.py b/trove/tests/unittests/trove_testtools.py index 0366d821..34e67b89 100644 --- a/trove/tests/unittests/trove_testtools.py +++ b/trove/tests/unittests/trove_testtools.py @@ -14,8 +14,10 @@ # under the License. import abc +import random import testtools from unittest import mock +import uuid from trove.common import cfg from trove.common.context import TroveContext @@ -96,3 +98,35 @@ class TestCase(testtools.TestCase): new_callable=mock.PropertyMock(return_value=value)) self.addCleanup(conf_patcher.stop) return conf_patcher.start() + + @classmethod + def random_name(cls, name='', prefix=None): + """Generate a random name that inclues a random number. + + :param str name: The name that you want to include + :param str prefix: The prefix that you want to include + + :return: a random name. The format is + '<prefix>-<name>-<random number>'. + (e.g. 'prefixfoo-namebar-154876201') + :rtype: string + """ + randbits = str(random.randint(1, 0x7fffffff)) + rand_name = randbits + if name: + rand_name = name + '-' + rand_name + if prefix: + rand_name = prefix + '-' + rand_name + return rand_name + + @classmethod + def random_uuid(cls): + return str(uuid.uuid4()) + + def assertDictContains(self, parent, child): + """Checks whether child dict is a subset of parent. + + assertDictContainsSubset() in standard Python 2.7 has been deprecated + since Python 3.2 + """ + self.assertEqual(parent, dict(parent, **child)) diff --git a/trove/tests/unittests/util/util.py b/trove/tests/unittests/util/util.py index bf1eda66..ce22ed7b 100644 --- a/trove/tests/unittests/util/util.py +++ b/trove/tests/unittests/util/util.py @@ -31,3 +31,11 @@ def init_db(): db_api.db_sync(CONF) session.configure_db(CONF) DB_SETUP = True + + +def cleanup_db(): + with LOCK: + global DB_SETUP + if DB_SETUP: + session.clean_db() + DB_SETUP = False |