summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLingxian Kong <anlin.kong@gmail.com>2021-02-10 09:53:06 +1300
committerLingxian Kong <anlin.kong@gmail.com>2021-02-11 10:57:45 +1300
commit2b6b7ab4b7810667426be26ff03a6c367bb2460d (patch)
tree7f6b96065c913bb622a9ae67d524526e0eacdcc3
parent73978a9b978788817ee98ef6970a35f9121a2dc9 (diff)
downloadtrove-2b6b7ab4b7810667426be26ff03a6c367bb2460d.tar.gz
Do not override RESTART_REQUIRED service status
The RESTART_REQUIRED service status set by guest agent should not be overridden by the health heartbeat mechanism. The RESTART_REQUIRED can only be changed when the instance is restarted or rebooted. Story: 2008612 Task: 41795 Change-Id: I98baf252452353237bc8fb14357df4e7bcb2867a (Cherry picked from 9c2e0bf3a0f1bc0b35a148174d8a4d2083f2b3c5)
-rw-r--r--trove/conductor/manager.py13
-rw-r--r--trove/guestagent/guest_log.py27
-rw-r--r--trove/instance/models.py62
-rw-r--r--trove/instance/service_status.py1
-rw-r--r--trove/tests/unittests/instance/test_service.py129
-rw-r--r--trove/tests/unittests/trove_testtools.py34
-rw-r--r--trove/tests/unittests/util/util.py8
7 files changed, 230 insertions, 44 deletions
diff --git a/trove/conductor/manager.py b/trove/conductor/manager.py
index 22fb75c5..d5738e19 100644
--- a/trove/conductor/manager.py
+++ b/trove/conductor/manager.py
@@ -24,7 +24,7 @@ from trove.common.serializable_notification import SerializableNotification
from trove.conductor.models import LastSeen
from trove.extensions.mysql import models as mysql_models
from trove.instance import models as inst_models
-from trove.instance.service_status import ServiceStatus
+from trove.instance import service_status as svc_status
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
@@ -86,11 +86,20 @@ class Manager(periodic_task.PeriodicTasks):
"payload": str(payload)})
status = inst_models.InstanceServiceStatus.find_by(
instance_id=instance_id)
+
if self._message_too_old(instance_id, 'heartbeat', sent):
return
+
+ if status.get_status() == svc_status.ServiceStatuses.RESTART_REQUIRED:
+ LOG.debug("Instance %s service status is RESTART_REQUIRED, "
+ "skip heartbeat", instance_id)
+ return
+
if payload.get('service_status') is not None:
status.set_status(
- ServiceStatus.from_description(payload['service_status']))
+ svc_status.ServiceStatus.from_description(
+ payload['service_status'])
+ )
status.save()
def update_backup(self, context, instance_id, backup_id,
diff --git a/trove/guestagent/guest_log.py b/trove/guestagent/guest_log.py
index 1b7c85d6..570a08ed 100644
--- a/trove/guestagent/guest_log.py
+++ b/trove/guestagent/guest_log.py
@@ -15,6 +15,7 @@
import enum
import hashlib
import os
+from pathlib import Path
from requests.exceptions import ConnectionError
from oslo_log import log as logging
@@ -245,34 +246,36 @@ class GuestLog(object):
'published': self._published_size})
def _update_details(self):
- # Make sure we can read the file
- if not self._file_readable or not os.access(self._file, os.R_OK):
+ if operating_system.exists(self._file, as_root=True):
+ file_path = Path(self._file)
+
+ # Make sure guest agent can read the log file.
if not os.access(self._file, os.R_OK):
- if operating_system.exists(self._file, as_root=True):
- operating_system.chmod(
- self._file, FileMode.ADD_ALL_R, as_root=True)
- self._file_readable = True
-
- if os.path.isfile(self._file):
- logstat = os.stat(self._file)
- self._size = logstat.st_size
+ operating_system.chmod(self._file, FileMode.ADD_ALL_R,
+ as_root=True)
+ operating_system.chmod(str(file_path.parent),
+ FileMode.ADD_GRP_RX_OTH_RX,
+ as_root=True)
+
+ self._size = file_path.stat().st_size
self._update_log_header_digest(self._file)
if self.status != LogStatus.Disabled:
if self._log_rotated():
self.status = LogStatus.Rotated
# See if we have stuff to publish
- elif logstat.st_size > self._published_size:
+ elif self._size > self._published_size:
self._set_status(self._published_size,
LogStatus.Partial, LogStatus.Ready)
# We've published everything so far
- elif logstat.st_size == self._published_size:
+ elif self._size == self._published_size:
self._set_status(self._published_size,
LogStatus.Published, LogStatus.Enabled)
# We've already handled this case (log rotated) so what gives?
else:
raise Exception(_("Bug in _log_rotated ?"))
else:
+ LOG.warning(f"File {self._file} does not exist")
self._published_size = 0
self._size = 0
diff --git a/trove/instance/models.py b/trove/instance/models.py
index c33ea1c6..70bd6fd6 100644
--- a/trove/instance/models.py
+++ b/trove/instance/models.py
@@ -602,24 +602,30 @@ def load_instance(cls, context, id, needs_server=False,
return cls(context, db_info, server, service_status)
-def load_instance_with_info(cls, context, id, cluster_id=None):
- db_info = get_db_info(context, id, cluster_id)
- LOG.debug('Task status for instance %s: %s', id, db_info.task_status)
-
- service_status = InstanceServiceStatus.find_by(instance_id=id)
- if (db_info.task_status == InstanceTasks.NONE and
- not service_status.is_uptodate()):
- LOG.warning('Guest agent heartbeat for instance %s has expried', id)
+def update_service_status(task_status, service_status, ins_id):
+ """Update service status as needed."""
+ if (task_status == InstanceTasks.NONE and
+ service_status.status != srvstatus.ServiceStatuses.RESTART_REQUIRED and
+ not service_status.is_uptodate()):
+ LOG.warning('Guest agent heartbeat for instance %s has expried',
+ ins_id)
service_status.status = \
srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
+
+def load_instance_with_info(cls, context, ins_id, cluster_id=None):
+ db_info = get_db_info(context, ins_id, cluster_id)
+ service_status = InstanceServiceStatus.find_by(instance_id=ins_id)
+
+ update_service_status(db_info.task_status, service_status, ins_id)
+
load_simple_instance_server_status(context, db_info)
load_simple_instance_addresses(context, db_info)
instance = cls(context, db_info, service_status)
- load_guest_info(instance, context, id)
+ load_guest_info(instance, context, ins_id)
load_server_group_info(instance, context)
@@ -897,6 +903,11 @@ class BaseInstance(SimpleInstance):
del_instance.set_status(srvstatus.ServiceStatuses.DELETED)
del_instance.save()
+ def set_servicestatus_restart(self):
+ del_instance = InstanceServiceStatus.find_by(instance_id=self.id)
+ del_instance.set_status(srvstatus.ServiceStatuses.RESTARTING)
+ del_instance.save()
+
def set_instance_fault_deleted(self):
try:
del_fault = DBInstanceFault.find_by(instance_id=self.id)
@@ -1432,7 +1443,10 @@ class Instance(BuiltInstance):
LOG.info("Rebooting instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
+
self.update_db(task_status=InstanceTasks.REBOOTING)
+ self.set_servicestatus_restart()
+
task_api.API(self.context).reboot(self.id)
def restart(self):
@@ -1440,13 +1454,10 @@ class Instance(BuiltInstance):
LOG.info("Restarting datastore on instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
- # Set our local status since Nova might not change it quick enough.
- # TODO(tim.simpson): Possible bad stuff can happen if this service
- # shuts down before it can set status to NONE.
- # We need a last updated time to mitigate this;
- # after some period of tolerance, we'll assume the
- # status is no longer in effect.
+
self.update_db(task_status=InstanceTasks.REBOOTING)
+ self.set_servicestatus_restart()
+
task_api.API(self.context).restart(self.id)
def detach_replica(self):
@@ -1819,9 +1830,9 @@ class Instances(object):
db.server_status = "SHUTDOWN" # Fake it...
db.addresses = []
- datastore_status = InstanceServiceStatus.find_by(
+ service_status = InstanceServiceStatus.find_by(
instance_id=db.id)
- if not datastore_status.status: # This should never happen.
+ if not service_status.status: # This should never happen.
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
@@ -1829,24 +1840,15 @@ class Instances(object):
# Get the real-time service status.
LOG.debug('Task status for instance %s: %s', db.id,
db.task_status)
- if db.task_status == InstanceTasks.NONE:
- last_heartbeat_delta = (
- timeutils.utcnow() - datastore_status.updated_at)
- agent_expiry_interval = timedelta(
- seconds=CONF.agent_heartbeat_expiry)
- if last_heartbeat_delta > agent_expiry_interval:
- LOG.warning(
- 'Guest agent heartbeat for instance %s has '
- 'expried', id)
- datastore_status.status = \
- srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
+ update_service_status(db.task_status, service_status, db.id)
except exception.ModelNotFoundError:
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
- ret.append(load_instance(context, db, datastore_status,
- server=server))
+ ret.append(
+ load_instance(context, db, service_status, server=server)
+ )
return ret
diff --git a/trove/instance/service_status.py b/trove/instance/service_status.py
index 913ecb5a..6c1c3fa9 100644
--- a/trove/instance/service_status.py
+++ b/trove/instance/service_status.py
@@ -36,6 +36,7 @@ class ServiceStatus(object):
ServiceStatuses.CRASHED._code,
ServiceStatuses.BLOCKED._code,
ServiceStatuses.HEALTHY._code,
+ ServiceStatuses.RESTART_REQUIRED._code,
]
return self._code in allowed_statuses
diff --git a/trove/tests/unittests/instance/test_service.py b/trove/tests/unittests/instance/test_service.py
new file mode 100644
index 00000000..aa0af8de
--- /dev/null
+++ b/trove/tests/unittests/instance/test_service.py
@@ -0,0 +1,129 @@
+# Copyright 2020 Catalyst Cloud
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from datetime import timedelta
+from unittest import mock
+
+from trove.common import cfg
+from trove.common import timeutils
+from trove.datastore import models as ds_models
+from trove.instance import models as ins_models
+from trove.instance import service
+from trove.instance import service_status as srvstatus
+from trove.tests.unittests import trove_testtools
+from trove.tests.unittests.util import util
+
+CONF = cfg.CONF
+
+
+class TestInstanceController(trove_testtools.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ util.init_db()
+
+ cls.ds_name = cls.random_name('datastore',
+ prefix='TestInstanceController')
+ ds_models.update_datastore(name=cls.ds_name, default_version=None)
+ cls.ds = ds_models.Datastore.load(cls.ds_name)
+
+ ds_models.update_datastore_version(
+ cls.ds_name, 'test_image_id', 'mysql', cls.random_uuid(), '', 1)
+
+ cls.ds_version_imageid = ds_models.DatastoreVersion.load(
+ cls.ds, 'test_image_id')
+
+ cls.controller = service.InstanceController()
+
+ super(TestInstanceController, cls).setUpClass()
+
+ @classmethod
+ def tearDownClass(cls):
+ util.cleanup_db()
+
+ super(TestInstanceController, cls).tearDownClass()
+
+ def setUp(self):
+ trove_testtools.patch_notifier(self)
+ super(TestInstanceController, self).setUp()
+
+ @mock.patch('trove.instance.models.load_server_group_info')
+ @mock.patch('trove.instance.models.load_guest_info')
+ @mock.patch('trove.instance.models.load_simple_instance_addresses')
+ @mock.patch('trove.instance.models.load_simple_instance_server_status')
+ def test_show_with_restart_required(self, load_server_mock,
+ load_addr_mock, load_guest_mock,
+ load_server_grp_mock):
+ # Create an instance in db.
+ instance = ins_models.DBInstance.create(
+ name=self.random_name('instance'),
+ flavor_id=self.random_uuid(),
+ tenant_id=self.random_uuid(),
+ volume_size=1,
+ datastore_version_id=self.ds_version_imageid.id,
+ task_status=ins_models.InstanceTasks.NONE,
+ compute_instance_id=self.random_uuid(),
+ server_status='ACTIVE'
+ )
+ ins_models.InstanceServiceStatus.create(
+ instance_id=instance.id,
+ status=srvstatus.ServiceStatuses.RESTART_REQUIRED,
+ )
+
+ # workaround to reset updated_at field.
+ service_status = ins_models.InstanceServiceStatus.find_by(
+ instance_id=instance.id)
+ service_status.updated_at = timeutils.utcnow() - timedelta(
+ seconds=(CONF.agent_heartbeat_expiry + 60))
+ ins_models.get_db_api().save(service_status)
+
+ ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id)
+ self.assertEqual(200, ret.status)
+
+ ret_instance = ret.data(None)['instance']
+ self.assertEqual('RESTART_REQUIRED', ret_instance.get('status'))
+
+ @mock.patch('trove.instance.models.load_server_group_info')
+ @mock.patch('trove.instance.models.load_guest_info')
+ @mock.patch('trove.instance.models.load_simple_instance_addresses')
+ @mock.patch('trove.instance.models.load_simple_instance_server_status')
+ def test_show_without_restart_required(self, load_server_mock,
+ load_addr_mock, load_guest_mock,
+ load_server_grp_mock):
+ # Create an instance in db.
+ instance = ins_models.DBInstance.create(
+ name=self.random_name('instance'),
+ flavor_id=self.random_uuid(),
+ tenant_id=self.random_uuid(),
+ volume_size=1,
+ datastore_version_id=self.ds_version_imageid.id,
+ task_status=ins_models.InstanceTasks.NONE,
+ compute_instance_id=self.random_uuid(),
+ server_status='ACTIVE'
+ )
+ ins_models.InstanceServiceStatus.create(
+ instance_id=instance.id,
+ status=srvstatus.ServiceStatuses.HEALTHY,
+ )
+
+ # workaround to reset updated_at field.
+ service_status = ins_models.InstanceServiceStatus.find_by(
+ instance_id=instance.id)
+ service_status.updated_at = timeutils.utcnow() - timedelta(
+ seconds=(CONF.agent_heartbeat_expiry + 60))
+ ins_models.get_db_api().save(service_status)
+
+ ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id)
+ self.assertEqual(200, ret.status)
+
+ ret_instance = ret.data(None)['instance']
+ self.assertEqual('ERROR', ret_instance.get('status'))
diff --git a/trove/tests/unittests/trove_testtools.py b/trove/tests/unittests/trove_testtools.py
index 0366d821..34e67b89 100644
--- a/trove/tests/unittests/trove_testtools.py
+++ b/trove/tests/unittests/trove_testtools.py
@@ -14,8 +14,10 @@
# under the License.
import abc
+import random
import testtools
from unittest import mock
+import uuid
from trove.common import cfg
from trove.common.context import TroveContext
@@ -96,3 +98,35 @@ class TestCase(testtools.TestCase):
new_callable=mock.PropertyMock(return_value=value))
self.addCleanup(conf_patcher.stop)
return conf_patcher.start()
+
+ @classmethod
+ def random_name(cls, name='', prefix=None):
+ """Generate a random name that inclues a random number.
+
+ :param str name: The name that you want to include
+ :param str prefix: The prefix that you want to include
+
+ :return: a random name. The format is
+ '<prefix>-<name>-<random number>'.
+ (e.g. 'prefixfoo-namebar-154876201')
+ :rtype: string
+ """
+ randbits = str(random.randint(1, 0x7fffffff))
+ rand_name = randbits
+ if name:
+ rand_name = name + '-' + rand_name
+ if prefix:
+ rand_name = prefix + '-' + rand_name
+ return rand_name
+
+ @classmethod
+ def random_uuid(cls):
+ return str(uuid.uuid4())
+
+ def assertDictContains(self, parent, child):
+ """Checks whether child dict is a subset of parent.
+
+ assertDictContainsSubset() in standard Python 2.7 has been deprecated
+ since Python 3.2
+ """
+ self.assertEqual(parent, dict(parent, **child))
diff --git a/trove/tests/unittests/util/util.py b/trove/tests/unittests/util/util.py
index bf1eda66..ce22ed7b 100644
--- a/trove/tests/unittests/util/util.py
+++ b/trove/tests/unittests/util/util.py
@@ -31,3 +31,11 @@ def init_db():
db_api.db_sync(CONF)
session.configure_db(CONF)
DB_SETUP = True
+
+
+def cleanup_db():
+ with LOCK:
+ global DB_SETUP
+ if DB_SETUP:
+ session.clean_db()
+ DB_SETUP = False