author    Peter Stachowski <peter@tesora.com>    2016-08-29 19:47:47 +0000
committer Peter Stachowski <peter@tesora.com>    2016-09-13 21:24:05 +0000
commit    bd761989eead77eead58c91ccb30fcb53d7a5c5d (patch)
tree      1b78761ebd453ed693694c0440817de73b0b42fd /trove/taskmanager/manager.py
parent    8ba72a5f3fab78fe337b87f84315d8aadc7f3c6f (diff)
download  trove-bd761989eead77eead58c91ccb30fcb53d7a5c5d.tar.gz
Allow for invalid packet sequence in keepalive
In the SQLAlchemy keep_alive class, MariaDB is failing because pymysql reports an invalid packet sequence. MariaDB seems to time out the client in a different way than MySQL and PXC do, which manifests itself as the aforementioned invalid sequence. It is now handled as a special-case exception. With this fix, the MariaDB scenario tests now pass.

The scenario tests were also tweaked a bit, which aided in the testing of the fix. 'group=instance' was created, and instance_error is now properly interleaved with instance_create. _has_status now calls get_instance with the admin client so that any faults are accompanied by a relevant stack trace. Cases where the result code was being checked out-of-sequence were removed, and explicit calls to check the HTTP code for the right client were added.

The replication error messages for promote and eject were enhanced as well, to attempt to debug spurious failures. One of those failures was 'Replication is not on after 60 seconds.' This was fixed by setting 'MASTER_CONNECT_RETRY' in the mariadb gtid replication strategy, as was done in:

https://review.openstack.org/#/c/188933

Finally, backup_incremental was added to the MariaDB supported groups and cleaned up elsewhere.

Closes-Bug: #1621702
Change-Id: Id6bde5a34e1d79eece3084f761dcd153c38ccbad
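As a rough illustration of the keep-alive fix described above (the class name, the PoolListener interface from older SQLAlchemy, and the error codes below are assumptions based on the commit message, not the exact Trove code), the special case amounts to translating pymysql's "packet sequence" InternalError into a SQLAlchemy DisconnectionError, so the pool discards the stale connection and checks out a fresh one:

    import pymysql
    from sqlalchemy import exc
    from sqlalchemy import interfaces


    class KeepAliveConnection(interfaces.PoolListener):
        """Ping a pooled connection on checkout; recycle it if stale."""

        def checkout(self, dbapi_con, con_record, con_proxy):
            try:
                # reconnect=False: raise instead of silently reconnecting
                # if the server has dropped us.
                dbapi_con.ping(False)
            except pymysql.err.OperationalError as ex:
                # 2006: server gone away, 2013: lost connection
                # (illustrative subset of the usual disconnect codes).
                if ex.args[0] in (2006, 2013):
                    raise exc.DisconnectionError()
                raise
            except pymysql.err.InternalError as ex:
                # MariaDB times out an idle client in a way pymysql
                # reports as an invalid packet sequence rather than a
                # lost connection; treat it the same so SQLAlchemy
                # retries on a fresh connection.
                if "Packet sequence number wrong" in str(ex):
                    raise exc.DisconnectionError()
                raise

Likewise, the MASTER_CONNECT_RETRY change could look something like the following hedged sketch; the exact statement built by the mariadb gtid replication strategy, and the retry interval, may differ:

    # Hypothetical snippet from a GTID replication strategy: adding
    # MASTER_CONNECT_RETRY tells the slave to keep retrying the master
    # connection instead of giving up, avoiding the intermittent
    # 'Replication is not on after 60 seconds.' failure.
    change_master_sql = (
        "CHANGE MASTER TO MASTER_HOST='%(host)s', "
        "MASTER_PORT=%(port)s, "
        "MASTER_USER='%(user)s', "
        "MASTER_PASSWORD='%(password)s', "
        "MASTER_CONNECT_RETRY=15, "
        "MASTER_USE_GTID=current_pos"
    )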
Diffstat (limited to 'trove/taskmanager/manager.py')
-rw-r--r--  trove/taskmanager/manager.py | 75
1 file changed, 38 insertions(+), 37 deletions(-)
diff --git a/trove/taskmanager/manager.py b/trove/taskmanager/manager.py
index d375b19a..a70872b7 100644
--- a/trove/taskmanager/manager.py
+++ b/trove/taskmanager/manager.py
@@ -120,41 +120,43 @@ class Manager(periodic_task.PeriodicTasks):
# and possibly some number of "orphaned" slaves
exception_replicas = []
+ error_messages = ""
for replica in replica_models:
try:
if replica.id != master_candidate.id:
replica.detach_replica(old_master, for_failover=True)
replica.attach_replica(master_candidate)
- except exception.TroveError:
- msg = _("promote-to-replica-source: Unable to migrate "
- "replica %(slave)s from old replica source "
- "%(old_master)s to new source %(new_master)s.")
- msg_values = {
- "slave": replica.id,
- "old_master": old_master.id,
- "new_master": master_candidate.id
- }
- LOG.exception(msg % msg_values)
+ except exception.TroveError as ex:
+ msg = (_("Unable to migrate replica %(slave)s from "
+ "old replica source %(old_master)s to "
+ "new source %(new_master)s on promote.") %
+ {"slave": replica.id,
+ "old_master": old_master.id,
+ "new_master": master_candidate.id})
+ LOG.exception(msg)
exception_replicas.append(replica)
+ error_messages += "%s (%s)\n" % (msg, ex)
try:
old_master.demote_replication_master()
- except Exception:
- LOG.exception(_("Exception demoting old replica source"))
+ except Exception as ex:
+ msg = (_("Exception demoting old replica source %s.") %
+ old_master.id)
+ LOG.exception(msg)
exception_replicas.append(old_master)
+ error_messages += "%s (%s)\n" % (msg, ex)
self._set_task_status([old_master] + replica_models,
InstanceTasks.NONE)
if exception_replicas:
self._set_task_status(exception_replicas,
InstanceTasks.PROMOTION_ERROR)
- msg = _("promote-to-replica-source %(id)s: The following "
- "replicas may not have been switched: %(replicas)s")
- msg_values = {
- "id": master_candidate.id,
- "replicas": exception_replicas
- }
- raise ReplicationSlaveAttachError(msg % msg_values)
+ msg = (_("promote-to-replica-source %(id)s: The following "
+ "replicas may not have been switched: %(replicas)s") %
+ {"id": master_candidate.id,
+ "replicas": [repl.id for repl in exception_replicas]})
+ raise ReplicationSlaveAttachError("%s:\n%s" %
+ (msg, error_messages))
with EndNotification(context):
master_candidate = BuiltInstanceTasks.load(context, instance_id)
@@ -207,35 +209,34 @@ class Manager(periodic_task.PeriodicTasks):
old_master.attach_public_ips(slave_ips)
exception_replicas = []
+ error_messages = ""
for replica in replica_models:
try:
if replica.id != master_candidate.id:
replica.detach_replica(old_master, for_failover=True)
replica.attach_replica(master_candidate)
- except exception.TroveError:
- msg = _("eject-replica-source: Unable to migrate "
- "replica %(slave)s from old replica source "
- "%(old_master)s to new source %(new_master)s.")
- msg_values = {
- "slave": replica.id,
- "old_master": old_master.id,
- "new_master": master_candidate.id
- }
- LOG.exception(msg % msg_values)
- exception_replicas.append(replica.id)
+ except exception.TroveError as ex:
+ msg = (_("Unable to migrate replica %(slave)s from "
+ "old replica source %(old_master)s to "
+ "new source %(new_master)s on eject.") %
+ {"slave": replica.id,
+ "old_master": old_master.id,
+ "new_master": master_candidate.id})
+ LOG.exception(msg)
+ exception_replicas.append(replica)
+ error_messages += "%s (%s)\n" % (msg, ex)
self._set_task_status([old_master] + replica_models,
InstanceTasks.NONE)
if exception_replicas:
self._set_task_status(exception_replicas,
InstanceTasks.EJECTION_ERROR)
- msg = _("eject-replica-source %(id)s: The following "
- "replicas may not have been switched: %(replicas)s")
- msg_values = {
- "id": master_candidate.id,
- "replicas": exception_replicas
- }
- raise ReplicationSlaveAttachError(msg % msg_values)
+ msg = (_("eject-replica-source %(id)s: The following "
+ "replicas may not have been switched: %(replicas)s") %
+ {"id": master_candidate.id,
+ "replicas": [repl.id for repl in exception_replicas]})
+ raise ReplicationSlaveAttachError("%s:\n%s" %
+ (msg, error_messages))
with EndNotification(context):
master = BuiltInstanceTasks.load(context, instance_id)