summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorAlan Conway <aconway@apache.org>2014-07-31 13:55:11 +0000
committerAlan Conway <aconway@apache.org>2014-07-31 13:55:11 +0000
commit22b2c33d37c8a4ccf7ebf41b34e3d236f4cc37f7 (patch)
tree96dfcdd3f4144ee476f4d3f949ea9f50c226b51c /tools
parent25df707b162d2342286285d5acd4d083967427fb (diff)
downloadqpid-python-22b2c33d37c8a4ccf7ebf41b34e3d236f4cc37f7.tar.gz
QPID-5942: qpid HA cluster may end up in joining state after HA primary is killed
There are two issues here, both related to the fact that rgmanager sees qpidd and qpidd-primary as two separate services. 1. The service start/stop scripts can be called concurrently. This can lead to a running qpidd process whose pid is not in the pidfile. rgmanager cannot detect or kill this qpidd and cannot start another qpidd because of the lock on the qpidd data directory. 2. rgmanager sees a primary failure as two failures: qpidd and qpidd-primary, and will then try to stop and start both services. The order of these actions is not defined and can lead to rgmanager killing a service it has just started. This patch makes two major changes to the init scripts: 1. Uses flock to lock the sensitive stop/start part of the scripts to ensure they are not executed concurrently. 2. On "stop" the scripts check if a running qpidd is primary or not. "qpidd stop" is a no-op if the running broker is primary, "qpidd-primary stop" is a no-op if it is not. This ensures that a broker will be stopped by the same stream of service actions that started it. Minor changes in this patch: - better logging of broker start-up and shut-down sequence. - qpid-ha heartbeat uses half of the timeout option. - add missing timeouts in qpid-ha. Notes: This changes the behavior of 'clusvcadm -d <qpidd-service>' on the primary node. Previously this would have stopped the qpidd service on that node, killed the qpidd process and relocated the primary service. Now this will stop the qpidd service (as far as rgmanager is concerned) but will not kill qpidd or relocate the primary service. When the primary is relocated the qpidd service will not be able to re-start on that node until it is re-enabled with 'clusvcadm -e'. git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk/qpid@1614895 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'tools')
-rwxr-xr-xtools/src/py/qpid-ha12
1 files changed, 7 insertions, 5 deletions
diff --git a/tools/src/py/qpid-ha b/tools/src/py/qpid-ha
index 49f6a244c6..640463c09a 100755
--- a/tools/src/py/qpid-ha
+++ b/tools/src/py/qpid-ha
@@ -19,7 +19,7 @@
# under the License.
#
-import optparse, sys, time, os, re
+import optparse, sys, time, os, re, math
from qpid.messaging import Connection
from qpid.messaging import Message as QpidMessage
from qpid.util import URL
@@ -100,7 +100,7 @@ class Command:
conn_options['client_properties'] = {'qpid.ha-admin' : 1}
if opts.timeout:
conn_options['timeout'] = opts.timeout
- conn_options['heartbeat'] = int(opts.timeout)
+ conn_options['heartbeat'] = int(math.ceil(opts.timeout/2))
connection = Connection.establish(opts.broker, **conn_options)
qmf_broker = self.connect_agent and BrokerAgent(connection)
ha_broker = self.connect_agent and qmf_broker.getHaBroker()
@@ -152,7 +152,7 @@ class PromoteCmd(Command):
def __init__(self):
Command.__init__(self, "promote","Promote a backup broker to primary. Note this command will not detect if another broker is already primary, and creating a second primary will make the cluster inconsistent. It is up to the caller (normally the cluster resource manager) to ensure there is only one primary.")
def do_execute(self, qmf_broker, ha_broker, opts, args):
- qmf_broker._method("promote", {}, HA_BROKER)
+ qmf_broker._method("promote", {}, HA_BROKER, timeout=opts.timeout)
PromoteCmd()
@@ -172,13 +172,15 @@ class StatusCmd(Command):
def do_execute(self, qmf_broker, ha_broker, opts, args):
if opts.is_primary:
if not ha_broker.status in ["active", "recovering"]: raise ExitStatus(1)
+ return
if opts.expect:
if opts.expect != ha_broker.status: raise ExitStatus(1)
+ return
+
def status(hb, b=None, ex=None):
if ex: print b, ex
elif b: print b, hb.status
else: print hb.status
-
self.all_brokers(ha_broker, opts, status)
StatusCmd()
@@ -187,7 +189,7 @@ class ReplicateCmd(Command):
def __init__(self):
Command.__init__(self, "replicate", "Set up replication from <queue> on <remote-broker> to <queue> on the current broker.", ["<queue>", "<remote-broker>"])
def do_execute(self, qmf_broker, ha_broker, opts, args):
- qmf_broker._method("replicate", {"broker":args[1], "queue":args[2]}, HA_BROKER)
+ qmf_broker._method("replicate", {"broker":args[1], "queue":args[2]}, HA_BROKER, timeout=opts.timeout)
ReplicateCmd()
class QueryCmd(Command):