From cee1f8e6c7e89558191b244395e50a7657a0f240 Mon Sep 17 00:00:00 2001 From: Ali Adil Date: Mon, 18 Jul 2016 21:23:48 +0000 Subject: Add command to delete BUILD instances and clusters Sometimes an instance/cluster can be stuck in BUILD state forever. Attempting to delete the instance in this state is currently not allowed. Add force-delete and reset-status commands. Reset-status will reset the status of an instance to ERROR and cluster to NONE. The reset-status command can only be used if the instance/cluster is in BUILD or ERROR state. Resetting the status of an instance in ERROR state can be useful as an instance might go ACTIVE after the specified timeout. Once the status has been reset it is possible for an instance to go ACTIVE if it receives a heartbeat from the guestagent. Force-delete will combine functionality of reset-status and delete. Change-Id: I83f6cdcdd884e51d002295b0d1f07341990e512c Depends-On: I957b4be5030e493e0eb8c6b6855d41b942b2823c Partial-Bug: #1579141 --- .../notes/force_delete-c2b06dbead554726.yaml | 6 ++ trove/cluster/models.py | 19 +++++- trove/common/notification.py | 21 +++++++ trove/instance/models.py | 31 ++++++++-- trove/instance/service.py | 17 +++++- trove/taskmanager/models.py | 4 +- trove/tests/int_tests.py | 8 ++- trove/tests/scenario/groups/__init__.py | 5 ++ .../scenario/groups/instance_error_create_group.py | 5 +- .../scenario/groups/instance_force_delete_group.py | 67 ++++++++++++++++++++++ trove/tests/scenario/groups/module_group.py | 2 +- .../runners/instance_force_delete_runners.py | 54 +++++++++++++++++ trove/tests/scenario/runners/test_runners.py | 2 +- 13 files changed, 229 insertions(+), 12 deletions(-) create mode 100644 releasenotes/notes/force_delete-c2b06dbead554726.yaml create mode 100644 trove/tests/scenario/groups/instance_force_delete_group.py create mode 100644 trove/tests/scenario/runners/instance_force_delete_runners.py diff --git a/releasenotes/notes/force_delete-c2b06dbead554726.yaml 
b/releasenotes/notes/force_delete-c2b06dbead554726.yaml new file mode 100644 index 00000000..3b5c89fb --- /dev/null +++ b/releasenotes/notes/force_delete-c2b06dbead554726.yaml @@ -0,0 +1,6 @@ +features: + - The reset-status command will set the task and status + of an instance to ERROR after which it can be deleted. + - The force-delete command will allow the deletion of + an instance even if the instance is stuck in BUILD + state. diff --git a/trove/cluster/models.py b/trove/cluster/models.py index 0cb62796..7ec90f28 100644 --- a/trove/cluster/models.py +++ b/trove/cluster/models.py @@ -21,7 +21,8 @@ from trove.cluster.tasks import ClusterTasks from trove.common import cfg from trove.common import exception from trove.common.i18n import _ -from trove.common.notification import DBaaSClusterGrow, DBaaSClusterShrink +from trove.common.notification import (DBaaSClusterGrow, DBaaSClusterShrink, + DBaaSClusterResetStatus) from trove.common.notification import StartNotification from trove.common import remote from trove.common import server_group as srv_grp @@ -136,6 +137,16 @@ class Cluster(object): LOG.info(_("Setting task to NONE on cluster %s") % self.id) self.update_db(task_status=ClusterTasks.NONE) + def reset_status(self): + self.validate_cluster_available([ClusterTasks.BUILDING_INITIAL]) + LOG.info(_("Resetting status to NONE on cluster %s") % self.id) + self.reset_task() + instances = inst_models.DBInstance.find_all(cluster_id=self.id, + deleted=False).all() + for inst in instances: + instance = inst_models.load_any_instance(self.context, inst.id) + instance.reset_status() + @property def id(self): return self.db_info.id @@ -291,6 +302,12 @@ class Cluster(object): with StartNotification(context, cluster_id=self.id): instance_ids = [instance['id'] for instance in param] return self.shrink(instance_ids) + elif action == "reset-status": + context.notification = DBaaSClusterResetStatus(context, + request=req) + with StartNotification(context, cluster_id=self.id): + 
return self.reset_status() + else: raise exception.BadRequest(_("Action %s not supported") % action) diff --git a/trove/common/notification.py b/trove/common/notification.py index 5bf23ab3..be7c96bf 100644 --- a/trove/common/notification.py +++ b/trove/common/notification.py @@ -506,6 +506,16 @@ class DBaaSInstanceDelete(DBaaSAPINotification): return ['instance_id'] +class DBaaSInstanceResetStatus(DBaaSAPINotification): + + def event_type(self): + return 'instance_reset_status' + + @abc.abstractmethod + def required_start_traits(self): + return ['instance_id'] + + class DBaaSInstanceDetach(DBaaSAPINotification): @abc.abstractmethod @@ -565,6 +575,17 @@ class DBaaSClusterDelete(DBaaSAPINotification): return ['cluster_id'] +class DBaaSClusterResetStatus(DBaaSAPINotification): + + @abc.abstractmethod + def event_type(self): + return 'cluster_reset_status' + + @abc.abstractmethod + def required_start_traits(self): + return ['cluster_id'] + + class DBaaSClusterAddShard(DBaaSAPINotification): @abc.abstractmethod diff --git a/trove/instance/models.py b/trove/instance/models.py index 6f213ed3..8b3668e6 100644 --- a/trove/instance/models.py +++ b/trove/instance/models.py @@ -246,6 +246,10 @@ class SimpleInstance(object): def is_building(self): return self.status in [InstanceStatus.BUILD] + @property + def is_error(self): + return self.status in [InstanceStatus.ERROR] + @property def is_datastore_running(self): """True if the service status indicates datastore is up and running.""" @@ -292,6 +296,10 @@ class SimpleInstance(object): if self.db_info.task_status.is_error: return InstanceStatus.ERROR + # If we've reset the status, show it as an error + if tr_instance.ServiceStatuses.UNKNOWN == self.datastore_status.status: + return InstanceStatus.ERROR + # Check for taskmanager status. 
action = self.db_info.task_status.action if 'BUILDING' == action: @@ -597,8 +605,9 @@ class BaseInstance(SimpleInstance): def delete(self): def _delete_resources(): if self.is_building: - raise exception.UnprocessableEntity("Instance %s is not ready." - % self.id) + raise exception.UnprocessableEntity( + "Instance %s is not ready. (Status is %s)." % + (self.id, self.status)) LOG.debug("Deleting instance with compute id = %s.", self.db_info.compute_instance_id) @@ -718,6 +727,20 @@ class BaseInstance(SimpleInstance): return files + def reset_status(self): + if self.is_building or self.is_error: + LOG.info(_LI("Resetting the status to ERROR on instance %s."), + self.id) + self.reset_task_status() + + reset_instance = InstanceServiceStatus.find_by(instance_id=self.id) + reset_instance.set_status(tr_instance.ServiceStatuses.UNKNOWN) + reset_instance.save() + else: + raise exception.UnprocessableEntity( + "Instance %s status can only be reset in BUILD or ERROR " + "state." % self.id) + class FreshInstance(BaseInstance): @classmethod @@ -727,8 +750,8 @@ class FreshInstance(BaseInstance): class BuiltInstance(BaseInstance): @classmethod - def load(cls, context, id): - return load_instance(cls, context, id, needs_server=True) + def load(cls, context, id, needs_server=True): + return load_instance(cls, context, id, needs_server=needs_server) class Instance(BuiltInstance): diff --git a/trove/instance/service.py b/trove/instance/service.py index 4fcf77f5..9cc0630e 100644 --- a/trove/instance/service.py +++ b/trove/instance/service.py @@ -78,7 +78,6 @@ class InstanceController(wsgi.Controller): if not body: raise exception.BadRequest(_("Invalid request body.")) context = req.environ[wsgi.CONTEXT_KEY] - instance = models.Instance.load(context, id) _actions = { 'restart': self._action_restart, 'resize': self._action_resize, @@ -86,6 +85,7 @@ class InstanceController(wsgi.Controller): 'promote_to_replica_source': self._action_promote_to_replica_source, 'eject_replica_source': 
self._action_eject_replica_source, + 'reset_status': self._action_reset_status, } selected_action = None action_name = None @@ -97,6 +97,10 @@ class InstanceController(wsgi.Controller): "instance %(instance_id)s for tenant '%(tenant_id)s'"), {'action_name': action_name, 'instance_id': id, 'tenant_id': tenant_id}) + needs_server = True + if action_name in ['reset_status']: + needs_server = False + instance = models.Instance.load(context, id, needs_server=needs_server) return selected_action(context, req, instance, body) def _action_restart(self, context, req, instance, body): @@ -163,6 +167,17 @@ class InstanceController(wsgi.Controller): instance.eject_replica_source() return wsgi.Result(None, 202) + def _action_reset_status(self, context, req, instance, body): + context.notification = notification.DBaaSInstanceResetStatus( + context, request=req) + with StartNotification(context, instance_id=instance.id): + instance.reset_status() + + LOG.debug("Failing backups for instance %s." % instance.id) + backup_model.fail_for_instance(instance.id) + + return wsgi.Result(None, 202) + def index(self, req, tenant_id): """Return all instances.""" LOG.info(_LI("Listing database instances for tenant '%s'"), tenant_id) diff --git a/trove/taskmanager/models.py b/trove/taskmanager/models.py index 1dae4d05..b170c43d 100755 --- a/trove/taskmanager/models.py +++ b/trove/taskmanager/models.py @@ -619,7 +619,9 @@ class FreshInstanceTasks(FreshInstance, NotifyMixin, ConfigurationMixin): status == rd_instance.ServiceStatuses.INSTANCE_READY): return True elif status not in [rd_instance.ServiceStatuses.NEW, - rd_instance.ServiceStatuses.BUILDING]: + rd_instance.ServiceStatuses.BUILDING, + rd_instance.ServiceStatuses.UNKNOWN, + rd_instance.ServiceStatuses.DELETED]: raise TroveError(_("Service not active, status: %s") % status) c_id = self.db_info.compute_instance_id diff --git a/trove/tests/int_tests.py b/trove/tests/int_tests.py index 751904c2..18bfe5c9 100644 --- a/trove/tests/int_tests.py 
+++ b/trove/tests/int_tests.py @@ -42,6 +42,7 @@ from trove.tests.scenario.groups import instance_actions_group from trove.tests.scenario.groups import instance_create_group from trove.tests.scenario.groups import instance_delete_group from trove.tests.scenario.groups import instance_error_create_group +from trove.tests.scenario.groups import instance_force_delete_group from trove.tests.scenario.groups import instance_upgrade_group from trove.tests.scenario.groups import module_group from trove.tests.scenario.groups import negative_cluster_actions_group @@ -150,6 +151,9 @@ instance_error_create_groups.extend([instance_error_create_group.GROUP]) instance_upgrade_groups = list(instance_create_groups) instance_upgrade_groups.extend([instance_upgrade_group.GROUP]) +instance_force_delete_groups = list(base_groups) +instance_force_delete_groups.extend([instance_force_delete_group.GROUP]) + backup_groups = list(instance_create_groups) backup_groups.extend([groups.BACKUP, groups.BACKUP_INST]) @@ -195,12 +199,13 @@ user_actions_groups.extend([user_actions_group.GROUP]) # groups common to all datastores common_groups = list(instance_actions_groups) common_groups.extend([guest_log_groups, instance_error_create_groups, - module_groups]) + instance_force_delete_groups, module_groups]) # Register: Component based groups register(["backup"], backup_groups) register(["backup_incremental"], backup_incremental_groups) register(["cluster"], cluster_actions_groups) +register(["common"], common_groups) register(["configuration"], configuration_groups) register(["configuration_create"], configuration_create_groups) register(["database"], database_actions_groups) @@ -209,6 +214,7 @@ register(["instance", "instance_actions"], instance_actions_groups) register(["instance_create"], instance_create_groups) register(["instance_error_create"], instance_error_create_groups) register(["instance_upgrade"], instance_upgrade_groups) +register(["instance_force_delete"], instance_force_delete_groups) 
register(["module"], module_groups) register(["module_create"], module_create_groups) register(["replication"], replication_groups) diff --git a/trove/tests/scenario/groups/__init__.py b/trove/tests/scenario/groups/__init__.py index 48d4c41f..42cb2755 100644 --- a/trove/tests/scenario/groups/__init__.py +++ b/trove/tests/scenario/groups/__init__.py @@ -87,6 +87,11 @@ INST_ERROR_DELETE = "scenario.inst_error_delete_grp" INST_ERROR_DELETE_WAIT = "scenario.inst_error_delete_wait_grp" +# Instance Force Delete Group +INST_FORCE_DELETE = "scenario.inst_force_delete_grp" +INST_FORCE_DELETE_WAIT = "scenario.inst_force_delete_wait_grp" + + # Module Group MODULE_CREATE = "scenario.module_create_grp" MODULE_DELETE = "scenario.module_delete_grp" diff --git a/trove/tests/scenario/groups/instance_error_create_group.py b/trove/tests/scenario/groups/instance_error_create_group.py index 888caf32..713b2cb9 100644 --- a/trove/tests/scenario/groups/instance_error_create_group.py +++ b/trove/tests/scenario/groups/instance_error_create_group.py @@ -52,8 +52,8 @@ class InstanceErrorCreateGroup(TestGroup): @test(depends_on_groups=[groups.INST_ERROR_CREATE], - groups=[GROUP, groups.INST_ERROR_CREATE_WAIT], - runs_after_groups=[groups.MODULE_CREATE, groups.CFGGRP_CREATE]) + runs_after_groups=[groups.MODULE_CREATE, groups.CFGGRP_CREATE], + groups=[GROUP, groups.INST_ERROR_CREATE_WAIT]) class InstanceErrorCreateWaitGroup(TestGroup): """Test that Instance Error Create Completes.""" @@ -94,6 +94,7 @@ class InstanceErrorDeleteGroup(TestGroup): @test(depends_on_groups=[groups.INST_ERROR_DELETE], + runs_after_groups=[groups.MODULE_INST_CREATE], groups=[GROUP, groups.INST_ERROR_DELETE_WAIT]) class InstanceErrorDeleteWaitGroup(TestGroup): """Test that Instance Error Delete Completes.""" diff --git a/trove/tests/scenario/groups/instance_force_delete_group.py b/trove/tests/scenario/groups/instance_force_delete_group.py new file mode 100644 index 00000000..fef58d18 --- /dev/null +++ 
b/trove/tests/scenario/groups/instance_force_delete_group.py @@ -0,0 +1,67 @@ +# Copyright 2016 Tesora Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from proboscis import test + +from trove.tests import PRE_INSTANCES +from trove.tests.scenario import groups +from trove.tests.scenario.groups.test_group import TestGroup +from trove.tests.scenario.runners import test_runners + + +GROUP = "scenario.instance_force_delete_group" + + +class InstanceForceDeleteRunnerFactory(test_runners.RunnerFactory): + + _runner_ns = 'instance_force_delete_runners' + _runner_cls = 'InstanceForceDeleteRunner' + + +@test(depends_on_groups=["services.initialize"], + runs_after_groups=[PRE_INSTANCES, groups.INST_ERROR_CREATE], + groups=[GROUP, groups.INST_FORCE_DELETE]) +class InstanceForceDeleteGroup(TestGroup): + """Test Instance Force Delete functionality.""" + + def __init__(self): + super(InstanceForceDeleteGroup, self).__init__( + InstanceForceDeleteRunnerFactory.instance()) + + @test + def create_build_instance(self): + """Create an instance in BUILD state.""" + self.test_runner.run_create_build_instance() + + @test(depends_on=['create_build_instance']) + def delete_build_instance(self): + """Make sure the instance in BUILD state deletes.""" + self.test_runner.run_delete_build_instance() + + +@test(depends_on_groups=[groups.INST_FORCE_DELETE], + runs_after_groups=[groups.MODULE_INST_CREATE], + groups=[GROUP, groups.INST_FORCE_DELETE_WAIT]) +class 
InstanceForceDeleteWaitGroup(TestGroup): + """Make sure the Force Delete instance goes away.""" + + def __init__(self): + super(InstanceForceDeleteWaitGroup, self).__init__( + InstanceForceDeleteRunnerFactory.instance()) + + @test + def wait_for_force_delete(self): + """Wait for the Force Delete instance to be gone.""" + self.test_runner.run_wait_for_force_delete() diff --git a/trove/tests/scenario/groups/module_group.py b/trove/tests/scenario/groups/module_group.py index 495d12a9..1eeee68d 100644 --- a/trove/tests/scenario/groups/module_group.py +++ b/trove/tests/scenario/groups/module_group.py @@ -286,7 +286,7 @@ class ModuleCreateGroup(TestGroup): @test(depends_on_groups=[groups.INST_CREATE_WAIT, groups.MODULE_CREATE], - runs_after_groups=[groups.INST_ERROR_DELETE], + runs_after_groups=[groups.INST_ERROR_DELETE, groups.INST_FORCE_DELETE], groups=[GROUP, groups.MODULE_INST, groups.MODULE_INST_CREATE]) class ModuleInstCreateGroup(TestGroup): """Test Module Instance Create functionality.""" diff --git a/trove/tests/scenario/runners/instance_force_delete_runners.py b/trove/tests/scenario/runners/instance_force_delete_runners.py new file mode 100644 index 00000000..03045a29 --- /dev/null +++ b/trove/tests/scenario/runners/instance_force_delete_runners.py @@ -0,0 +1,54 @@ +# Copyright 2016 Tesora Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from proboscis import SkipTest + +from trove.tests.scenario.runners.test_runners import TestRunner + + +class InstanceForceDeleteRunner(TestRunner): + + def __init__(self): + super(InstanceForceDeleteRunner, self).__init__(sleep_time=1) + + self.build_inst_id = None + + def run_create_build_instance(self, expected_states=['NEW', 'BUILD'], + expected_http_code=200): + if self.is_using_existing_instance: + raise SkipTest("Using an existing instance.") + + name = self.instance_info.name + '_build' + flavor = self.get_instance_flavor() + + inst = self.auth_client.instances.create( + name, + self.get_flavor_href(flavor), + self.instance_info.volume, + nics=self.instance_info.nics, + datastore=self.instance_info.dbaas_datastore, + datastore_version=self.instance_info.dbaas_datastore_version) + self.assert_instance_action([inst.id], expected_states, + expected_http_code) + self.build_inst_id = inst.id + + def run_delete_build_instance(self, expected_http_code=202): + if self.build_inst_id: + self.auth_client.instances.force_delete(self.build_inst_id) + self.assert_client_code(expected_http_code) + + def run_wait_for_force_delete(self): + if self.build_inst_id: + self.assert_all_gone([self.build_inst_id], ['SHUTDOWN']) diff --git a/trove/tests/scenario/runners/test_runners.py b/trove/tests/scenario/runners/test_runners.py index 6816a635..1ef0f5aa 100644 --- a/trove/tests/scenario/runners/test_runners.py +++ b/trove/tests/scenario/runners/test_runners.py @@ -139,7 +139,7 @@ class RunnerFactory(object): # such as a missing override class. Anything else # shouldn't be suppressed. l_msg = ie.message.lower() - if load_type not in l_msg or ( + if (load_type and load_type not in l_msg) or ( 'no module named' not in l_msg and 'cannot be found' not in l_msg): raise -- cgit v1.2.1