summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLingxian Kong <anlin.kong@gmail.com>2021-02-10 09:53:06 +1300
committerLingxian Kong <anlin.kong@gmail.com>2021-02-10 15:51:54 +1300
commit9c2e0bf3a0f1bc0b35a148174d8a4d2083f2b3c5 (patch)
tree86a3bf502d2addb21bcfd359c9f437b1e56edf75
parent6edd3332b68864e69eb6733a71ecb5e827871d8f (diff)
downloadtrove-9c2e0bf3a0f1bc0b35a148174d8a4d2083f2b3c5.tar.gz
Do not override RESTART_REQUIRED service status
The RESTART_REQUIRED service status set by guest agent should not be overridden by the health heartbeat mechanism. The RESTART_REQUIRED can only be changed when the instance is restarted or rebooted. Story: 2008612 Task: 41795 Change-Id: I98baf252452353237bc8fb14357df4e7bcb2867a
-rw-r--r--trove/conductor/manager.py13
-rw-r--r--trove/guestagent/guest_log.py27
-rw-r--r--trove/instance/models.py61
-rw-r--r--trove/instance/service_status.py1
-rw-r--r--trove/tests/unittests/instance/test_service.py79
5 files changed, 138 insertions, 43 deletions
diff --git a/trove/conductor/manager.py b/trove/conductor/manager.py
index 22fb75c5..d5738e19 100644
--- a/trove/conductor/manager.py
+++ b/trove/conductor/manager.py
@@ -24,7 +24,7 @@ from trove.common.serializable_notification import SerializableNotification
from trove.conductor.models import LastSeen
from trove.extensions.mysql import models as mysql_models
from trove.instance import models as inst_models
-from trove.instance.service_status import ServiceStatus
+from trove.instance import service_status as svc_status
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
@@ -86,11 +86,20 @@ class Manager(periodic_task.PeriodicTasks):
"payload": str(payload)})
status = inst_models.InstanceServiceStatus.find_by(
instance_id=instance_id)
+
if self._message_too_old(instance_id, 'heartbeat', sent):
return
+
+ if status.get_status() == svc_status.ServiceStatuses.RESTART_REQUIRED:
+ LOG.debug("Instance %s service status is RESTART_REQUIRED, "
+ "skip heartbeat", instance_id)
+ return
+
if payload.get('service_status') is not None:
status.set_status(
- ServiceStatus.from_description(payload['service_status']))
+ svc_status.ServiceStatus.from_description(
+ payload['service_status'])
+ )
status.save()
def update_backup(self, context, instance_id, backup_id,
diff --git a/trove/guestagent/guest_log.py b/trove/guestagent/guest_log.py
index 1b7c85d6..570a08ed 100644
--- a/trove/guestagent/guest_log.py
+++ b/trove/guestagent/guest_log.py
@@ -15,6 +15,7 @@
import enum
import hashlib
import os
+from pathlib import Path
from requests.exceptions import ConnectionError
from oslo_log import log as logging
@@ -245,34 +246,36 @@ class GuestLog(object):
'published': self._published_size})
def _update_details(self):
- # Make sure we can read the file
- if not self._file_readable or not os.access(self._file, os.R_OK):
+ if operating_system.exists(self._file, as_root=True):
+ file_path = Path(self._file)
+
+ # Make sure guest agent can read the log file.
if not os.access(self._file, os.R_OK):
- if operating_system.exists(self._file, as_root=True):
- operating_system.chmod(
- self._file, FileMode.ADD_ALL_R, as_root=True)
- self._file_readable = True
-
- if os.path.isfile(self._file):
- logstat = os.stat(self._file)
- self._size = logstat.st_size
+ operating_system.chmod(self._file, FileMode.ADD_ALL_R,
+ as_root=True)
+ operating_system.chmod(str(file_path.parent),
+ FileMode.ADD_GRP_RX_OTH_RX,
+ as_root=True)
+
+ self._size = file_path.stat().st_size
self._update_log_header_digest(self._file)
if self.status != LogStatus.Disabled:
if self._log_rotated():
self.status = LogStatus.Rotated
# See if we have stuff to publish
- elif logstat.st_size > self._published_size:
+ elif self._size > self._published_size:
self._set_status(self._published_size,
LogStatus.Partial, LogStatus.Ready)
# We've published everything so far
- elif logstat.st_size == self._published_size:
+ elif self._size == self._published_size:
self._set_status(self._published_size,
LogStatus.Published, LogStatus.Enabled)
# We've already handled this case (log rotated) so what gives?
else:
raise Exception(_("Bug in _log_rotated ?"))
else:
+ LOG.warning(f"File {self._file} does not exist")
self._published_size = 0
self._size = 0
diff --git a/trove/instance/models.py b/trove/instance/models.py
index cbd6933a..b04e3ae6 100644
--- a/trove/instance/models.py
+++ b/trove/instance/models.py
@@ -607,23 +607,30 @@ def load_instance(cls, context, id, needs_server=False,
return cls(context, db_info, server, service_status)
-def load_instance_with_info(cls, context, id, cluster_id=None):
- db_info = get_db_info(context, id, cluster_id)
-
- service_status = InstanceServiceStatus.find_by(instance_id=id)
- if (db_info.task_status == InstanceTasks.NONE and
- not service_status.is_uptodate()):
- LOG.warning('Guest agent heartbeat for instance %s has expried', id)
+def update_service_status(task_status, service_status, ins_id):
+ """Update service status as needed."""
+ if (task_status == InstanceTasks.NONE and
+ service_status.status != srvstatus.ServiceStatuses.RESTART_REQUIRED and
+ not service_status.is_uptodate()):
+ LOG.warning('Guest agent heartbeat for instance %s has expried',
+ ins_id)
service_status.status = \
srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
+
+def load_instance_with_info(cls, context, ins_id, cluster_id=None):
+ db_info = get_db_info(context, ins_id, cluster_id)
+ service_status = InstanceServiceStatus.find_by(instance_id=ins_id)
+
+ update_service_status(db_info.task_status, service_status, ins_id)
+
load_simple_instance_server_status(context, db_info)
load_simple_instance_addresses(context, db_info)
instance = cls(context, db_info, service_status)
- load_guest_info(instance, context, id)
+ load_guest_info(instance, context, ins_id)
load_server_group_info(instance, context)
@@ -901,6 +908,11 @@ class BaseInstance(SimpleInstance):
del_instance.set_status(srvstatus.ServiceStatuses.DELETED)
del_instance.save()
+ def set_servicestatus_restart(self):
+ del_instance = InstanceServiceStatus.find_by(instance_id=self.id)
+ del_instance.set_status(srvstatus.ServiceStatuses.RESTARTING)
+ del_instance.save()
+
def set_instance_fault_deleted(self):
try:
del_fault = DBInstanceFault.find_by(instance_id=self.id)
@@ -1442,7 +1454,10 @@ class Instance(BuiltInstance):
LOG.info("Rebooting instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
+
self.update_db(task_status=InstanceTasks.REBOOTING)
+ self.set_servicestatus_restart()
+
task_api.API(self.context).reboot(self.id)
def restart(self):
@@ -1450,13 +1465,10 @@ class Instance(BuiltInstance):
LOG.info("Restarting datastore on instance %s.", self.id)
if self.db_info.cluster_id is not None and not self.context.is_admin:
raise exception.ClusterInstanceOperationNotSupported()
- # Set our local status since Nova might not change it quick enough.
- # TODO(tim.simpson): Possible bad stuff can happen if this service
- # shuts down before it can set status to NONE.
- # We need a last updated time to mitigate this;
- # after some period of tolerance, we'll assume the
- # status is no longer in effect.
+
self.update_db(task_status=InstanceTasks.REBOOTING)
+ self.set_servicestatus_restart()
+
task_api.API(self.context).restart(self.id)
def detach_replica(self):
@@ -1829,9 +1841,9 @@ class Instances(object):
db.server_status = "SHUTDOWN" # Fake it...
db.addresses = []
- datastore_status = InstanceServiceStatus.find_by(
+ service_status = InstanceServiceStatus.find_by(
instance_id=db.id)
- if not datastore_status.status: # This should never happen.
+ if not service_status.status: # This should never happen.
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
@@ -1839,24 +1851,15 @@ class Instances(object):
# Get the real-time service status.
LOG.debug('Task status for instance %s: %s', db.id,
db.task_status)
- if db.task_status == InstanceTasks.NONE:
- last_heartbeat_delta = (
- timeutils.utcnow() - datastore_status.updated_at)
- agent_expiry_interval = timedelta(
- seconds=CONF.agent_heartbeat_expiry)
- if last_heartbeat_delta > agent_expiry_interval:
- LOG.warning(
- 'Guest agent heartbeat for instance %s has '
- 'expried', id)
- datastore_status.status = \
- srvstatus.ServiceStatuses.FAILED_TIMEOUT_GUESTAGENT
+ update_service_status(db.task_status, service_status, db.id)
except exception.ModelNotFoundError:
LOG.error("Server status could not be read for "
"instance id(%s).", db.id)
continue
- ret.append(load_instance(context, db, datastore_status,
- server=server))
+ ret.append(
+ load_instance(context, db, service_status, server=server)
+ )
return ret
diff --git a/trove/instance/service_status.py b/trove/instance/service_status.py
index 913ecb5a..6c1c3fa9 100644
--- a/trove/instance/service_status.py
+++ b/trove/instance/service_status.py
@@ -36,6 +36,7 @@ class ServiceStatus(object):
ServiceStatuses.CRASHED._code,
ServiceStatuses.BLOCKED._code,
ServiceStatuses.HEALTHY._code,
+ ServiceStatuses.RESTART_REQUIRED._code,
]
return self._code in allowed_statuses
diff --git a/trove/tests/unittests/instance/test_service.py b/trove/tests/unittests/instance/test_service.py
index a38c62f5..807cb161 100644
--- a/trove/tests/unittests/instance/test_service.py
+++ b/trove/tests/unittests/instance/test_service.py
@@ -11,11 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from datetime import timedelta
from unittest import mock
from trove.common import cfg
from trove.common import clients
from trove.common import exception
+from trove.common import timeutils
from trove.datastore import models as ds_models
from trove.instance import models as ins_models
from trove.instance import service
@@ -174,3 +176,80 @@ class TestInstanceController(trove_testtools.TestCase):
mock.ANY, "upgrade",
instance_id=instance.id,
datastore_version_id=new_ds_version.id)
+
+ @mock.patch('trove.instance.models.load_server_group_info')
+ @mock.patch('trove.instance.models.load_guest_info')
+ @mock.patch('trove.instance.models.load_simple_instance_addresses')
+ @mock.patch('trove.instance.models.load_simple_instance_server_status')
+ def test_show_with_restart_required(self, load_server_mock,
+ load_addr_mock, load_guest_mock,
+ load_server_grp_mock):
+ # Create an instance in db.
+ instance = ins_models.DBInstance.create(
+ name=self.random_name('instance'),
+ flavor_id=self.random_uuid(),
+ tenant_id=self.random_uuid(),
+ volume_size=1,
+ datastore_version_id=self.ds_version_imageid.id,
+ task_status=ins_models.InstanceTasks.NONE,
+ compute_instance_id=self.random_uuid(),
+ server_status='ACTIVE'
+ )
+ ins_models.InstanceServiceStatus.create(
+ instance_id=instance.id,
+ status=srvstatus.ServiceStatuses.RESTART_REQUIRED,
+ )
+
+ # workaround to reset updated_at field.
+ service_status = ins_models.InstanceServiceStatus.find_by(
+ instance_id=instance.id)
+ service_status.updated_at = timeutils.utcnow() - timedelta(
+ seconds=(CONF.agent_heartbeat_expiry + 60))
+ ins_models.get_db_api().save(service_status)
+
+ ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id)
+ self.assertEqual(200, ret.status)
+
+ ret_instance = ret.data(None)['instance']
+
+ self.assertEqual('ACTIVE', ret_instance.get('status'))
+ self.assertEqual('RESTART_REQUIRED',
+ ret_instance.get('operating_status'))
+
+ @mock.patch('trove.instance.models.load_server_group_info')
+ @mock.patch('trove.instance.models.load_guest_info')
+ @mock.patch('trove.instance.models.load_simple_instance_addresses')
+ @mock.patch('trove.instance.models.load_simple_instance_server_status')
+ def test_show_without_restart_required(self, load_server_mock,
+ load_addr_mock, load_guest_mock,
+ load_server_grp_mock):
+ # Create an instance in db.
+ instance = ins_models.DBInstance.create(
+ name=self.random_name('instance'),
+ flavor_id=self.random_uuid(),
+ tenant_id=self.random_uuid(),
+ volume_size=1,
+ datastore_version_id=self.ds_version_imageid.id,
+ task_status=ins_models.InstanceTasks.NONE,
+ compute_instance_id=self.random_uuid(),
+ server_status='ACTIVE'
+ )
+ ins_models.InstanceServiceStatus.create(
+ instance_id=instance.id,
+ status=srvstatus.ServiceStatuses.HEALTHY,
+ )
+
+ # workaround to reset updated_at field.
+ service_status = ins_models.InstanceServiceStatus.find_by(
+ instance_id=instance.id)
+ service_status.updated_at = timeutils.utcnow() - timedelta(
+ seconds=(CONF.agent_heartbeat_expiry + 60))
+ ins_models.get_db_api().save(service_status)
+
+ ret = self.controller.show(mock.MagicMock(), mock.ANY, instance.id)
+ self.assertEqual(200, ret.status)
+
+ ret_instance = ret.data(None)['instance']
+
+ self.assertEqual('ACTIVE', ret_instance.get('status'))
+ self.assertEqual('ERROR', ret_instance.get('operating_status'))