diff options
author | Alan Conway <aconway@apache.org> | 2014-07-31 13:55:11 +0000 |
---|---|---|
committer | Alan Conway <aconway@apache.org> | 2014-07-31 13:55:11 +0000 |
commit | c9276b03da088b3f4d3f4b527f2e02703e2729eb (patch) | |
tree | b3f0553221917ffeb27f9562d9df7a5d9f8000d2 /qpid/tools | |
parent | 5b6f651d3f2c5b33fa510e120dc0e98f6a95409a (diff) | |
download | qpid-python-c9276b03da088b3f4d3f4b527f2e02703e2729eb.tar.gz |
QPID-5942: qpid HA cluster may end-up in joining state after HA primary is killed
There are two issues here, both related to the fact that rgmanager sees qpidd
and qpidd-primary as two separate services.
1. The service start/stop scripts can be called concurrently. This can lead to
running a qpidd process whose pid is not in the pidfile. rgmanager cannot
detect or kill this qpidd and cannot start another qpidd because of the lock
on the qpidd data directory.
2. rgmanager sees a primary failure as two failures: qpidd and qpidd-primary,
and will then try to stop and start both services. The order of these actions
is not defined and can lead to rgmanager killing a service it has just
started.
This patch makes two major changes to the init scripts:
1. Uses flock to lock the sensitive stop/start part of the scripts to ensure
they are not executed concurrently.
2. On "stop" the scripts check if a running qpidd is primary or not. "qpidd stop"
is a no-op if the running broker is primary, "qpidd-primary stop" is a no-op
if it is not. This ensures that a broker will be stopped by the same stream
of service actions that started it.
Minor changes in this patch:
- better logging of broker start-up and shut-down sequence.
- qpid-ha heartbeat uses half of the timeout option.
- add missing timeouts in qpid-ha.
Notes:
This changes the behavior of 'clusvcadm -d <qpidd-service>' on the primary node.
Previously this would have stopped the qpidd service on that node, killed the
qpidd process and relocated the primary service. Now this will stop the qpidd
service (as far as rgmanager is concerned) but will not kill qpidd or relocate
the primary service. When the primary is relocated the qpidd service will not be
able to re-start on that node until it is re-enabled with 'clusvcadm -e'.
git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@1614895 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'qpid/tools')
-rwxr-xr-x | qpid/tools/src/py/qpid-ha | 12 |
1 files changed, 7 insertions, 5 deletions
diff --git a/qpid/tools/src/py/qpid-ha b/qpid/tools/src/py/qpid-ha index 49f6a244c6..640463c09a 100755 --- a/qpid/tools/src/py/qpid-ha +++ b/qpid/tools/src/py/qpid-ha @@ -19,7 +19,7 @@ # under the License. # -import optparse, sys, time, os, re +import optparse, sys, time, os, re, math from qpid.messaging import Connection from qpid.messaging import Message as QpidMessage from qpid.util import URL @@ -100,7 +100,7 @@ class Command: conn_options['client_properties'] = {'qpid.ha-admin' : 1} if opts.timeout: conn_options['timeout'] = opts.timeout - conn_options['heartbeat'] = int(opts.timeout) + conn_options['heartbeat'] = int(math.ceil(opts.timeout/2)) connection = Connection.establish(opts.broker, **conn_options) qmf_broker = self.connect_agent and BrokerAgent(connection) ha_broker = self.connect_agent and qmf_broker.getHaBroker() @@ -152,7 +152,7 @@ class PromoteCmd(Command): def __init__(self): Command.__init__(self, "promote","Promote a backup broker to primary. Note this command will not detect if another broker is already primary, and creating a second primary will make the cluster inconsistent. 
It is up to the caller (normally the cluster resource manager) to ensure there is only one primary.") def do_execute(self, qmf_broker, ha_broker, opts, args): - qmf_broker._method("promote", {}, HA_BROKER) + qmf_broker._method("promote", {}, HA_BROKER, timeout=opts.timeout) PromoteCmd() @@ -172,13 +172,15 @@ class StatusCmd(Command): def do_execute(self, qmf_broker, ha_broker, opts, args): if opts.is_primary: if not ha_broker.status in ["active", "recovering"]: raise ExitStatus(1) + return if opts.expect: if opts.expect != ha_broker.status: raise ExitStatus(1) + return + def status(hb, b=None, ex=None): if ex: print b, ex elif b: print b, hb.status else: print hb.status - self.all_brokers(ha_broker, opts, status) StatusCmd() @@ -187,7 +189,7 @@ class ReplicateCmd(Command): def __init__(self): Command.__init__(self, "replicate", "Set up replication from <queue> on <remote-broker> to <queue> on the current broker.", ["<queue>", "<remote-broker>"]) def do_execute(self, qmf_broker, ha_broker, opts, args): - qmf_broker._method("replicate", {"broker":args[1], "queue":args[2]}, HA_BROKER) + qmf_broker._method("replicate", {"broker":args[1], "queue":args[2]}, HA_BROKER, timeout=opts.timeout) ReplicateCmd() class QueryCmd(Command): |