summaryrefslogtreecommitdiff
path: root/cpp/src
diff options
context:
space:
mode:
authorAlan Conway <aconway@apache.org>2009-08-10 21:10:53 +0000
committerAlan Conway <aconway@apache.org>2009-08-10 21:10:53 +0000
commit9a0521e5562ba6bf5c3468eb171c109e166cfa5d (patch)
treea0f3e6be8d28b634a0e2ee94d5fd853dc3562ffb /cpp/src
parent5518c23f9a69f6d616abc770bda0677b2f0b51ac (diff)
downloadqpid-python-9a0521e5562ba6bf5c3468eb171c109e166cfa5d.tar.gz
Watchdog feature to remove unresponsive cluster nodes.
In some instances (e.g. while resolving an error) it's possible for a hung process to hang the entire cluster as they wait for its response. The cluster can handle terminated processes but hung processes present a problem. If the watchdog plugin is loaded and --watchdog-interval is set then the broker forks a child process that runs a very simple watchdog program, and starts a timer in the broker process to signal the watchdog every interval/2 seconds. The watchdog kills its parent if it does not receive a signal for interval seconds. This allows a stuck broker to be removed from the cluster so other cluster members can continue. git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk/qpid@802927 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'cpp/src')
-rw-r--r--cpp/src/Makefile.am1
-rw-r--r--cpp/src/cluster.mk9
-rw-r--r--cpp/src/qpid/cluster/WatchDogPlugin.cpp114
-rw-r--r--cpp/src/qpid/cluster/qpidd_watchdog.cpp60
-rw-r--r--cpp/src/tests/cluster.mk68
-rwxr-xr-xcpp/src/tests/test_watchdog16
6 files changed, 235 insertions, 33 deletions
diff --git a/cpp/src/Makefile.am b/cpp/src/Makefile.am
index 24b92c8981..feff066e2e 100644
--- a/cpp/src/Makefile.am
+++ b/cpp/src/Makefile.am
@@ -115,6 +115,7 @@ INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(srcdir) -I=$(bu
# Destination for installed programs and tests defined here
#
qpidexecdir = $(libexecdir)/qpid
+AM_CXXFLAGS += -DQPID_EXEC_DIR=\"$(qpidexecdir)\"
qpidexec_PROGRAMS =
qpidexec_SCRIPTS =
qpidtestdir = $(qpidexecdir)/tests
diff --git a/cpp/src/cluster.mk b/cpp/src/cluster.mk
index c4907a1b04..d90a06e1e2 100644
--- a/cpp/src/cluster.mk
+++ b/cpp/src/cluster.mk
@@ -89,4 +89,13 @@ cluster_la_LIBADD= -lcpg $(libcman) libqpidbroker.la libqpidclient.la
cluster_la_CXXFLAGS = $(AM_CXXFLAGS) -fno-strict-aliasing
cluster_la_LDFLAGS = $(PLUGINLDFLAGS)
+# The watchdog plugin and helper executable
+dmodule_LTLIBRARIES += watchdog.la
+watchdog_la_SOURCES = qpid/cluster/WatchDogPlugin.cpp
+watchdog_la_LIBADD = libqpidbroker.la
+watchdog_la_LDFLAGS = $(PLUGINLDFLAGS)
+
+qpidexec_PROGRAMS += qpidd_watchdog
+qpidd_watchdog_SOURCES = qpid/cluster/qpidd_watchdog.cpp
+
endif # HAVE_LIBCPG
diff --git a/cpp/src/qpid/cluster/WatchDogPlugin.cpp b/cpp/src/qpid/cluster/WatchDogPlugin.cpp
new file mode 100644
index 0000000000..1b813411f6
--- /dev/null
+++ b/cpp/src/qpid/cluster/WatchDogPlugin.cpp
@@ -0,0 +1,114 @@
+/*
+ *
+ * Copyright (c) 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "qpid/Plugin.h"
+#include "qpid/Options.h"
+#include "qpid/log/Statement.h"
+#include "qpid/broker/Broker.h"
+#include "qpid/sys/Timer.h"
+#include "qpid/sys/Fork.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+namespace qpid {
+namespace cluster {
+
+using broker::Broker;
+
+/** Values configured through the watchdog plugin's options. */
+struct Settings {
+    // Watchdog interval in seconds; 0 (the default) disables the watchdog.
+    int interval;
+
+    Settings()
+        : interval(0)
+    {}
+};
+
+/**
+ * Option definitions for the watchdog plugin.
+ * Parsed values are written into the Settings object passed to the
+ * constructor, which must outlive this object.
+ */
+struct WatchDogOptions : public qpid::Options {
+    Settings& settings;
+
+    WatchDogOptions(Settings& s) : settings(s) {
+        // The help text is built from adjacent string literals rather than
+        // a backslash line continuation inside one literal: a continuation
+        // embeds the source indentation of the next line in the text shown
+        // to the user.
+        addOptions()
+            ("watchdog-interval", optValue(settings.interval, "N"),
+             "broker is automatically killed if it is hung for more than "
+             "N seconds. 0 disables watchdog.");
+    }
+};
+
+/**
+ * Timer task that sends a keepalive signal (SIGUSR1) to the watchdog
+ * process. The task is scheduled with a delay of half the watchdog
+ * interval and re-schedules a fresh copy of itself each time it fires.
+ */
+struct WatchDogTask : public sys::TimerTask {
+    int pid;                    // Process id of the watchdog to signal.
+    sys::Timer& timer;          // Timer used to schedule the next keepalive.
+    int interval;               // Watchdog interval in seconds.
+
+    WatchDogTask(int pid_, sys::Timer& t, int _interval)
+        : TimerTask(_interval*sys::TIME_SEC/2), pid(pid_), timer(t), interval(_interval) {}
+
+    /** Schedule the next keepalive, then signal the watchdog. */
+    void fire() {
+        // Re-arm first so keepalives continue even if signalling fails.
+        timer.add (new WatchDogTask(pid, timer, interval));
+        QPID_LOG(debug, "Sending keepalive signal to watchdog");
+        // Report a failed keepalive instead of ignoring it silently.
+        if (::kill(pid, SIGUSR1) != 0) {
+            QPID_LOG(warning, "Failed to send keepalive signal to watchdog, pid=" << pid);
+        }
+    }
+};
+
+/**
+ * Broker plugin that starts an external watchdog process.
+ *
+ * When loaded into a Broker with a positive watchdog interval, it forks a
+ * child that execs the qpidd_watchdog helper and adds a WatchDogTask to the
+ * broker's timer to send the helper periodic keepalive signals.
+ */
+struct WatchDogPlugin : public qpid::Plugin, public qpid::sys::Fork {
+    Settings settings;
+    WatchDogOptions options;
+    Broker* broker;
+    int watchdogPid;            // Pid of the watchdog child, 0 if not started.
+
+    WatchDogPlugin() : options(settings), broker(0), watchdogPid(0) {}
+
+    /** Terminate and reap the watchdog child, if one was started. */
+    ~WatchDogPlugin() {
+        // Only wait when a watchdog exists: waitpid(0, ...) would wait for
+        // any child in the caller's process group, not for the watchdog.
+        if (watchdogPid > 0) {
+            ::kill(watchdogPid, SIGTERM);
+            ::waitpid(watchdogPid, 0, 0);
+        }
+    }
+
+    Options* getOptions() { return &options; }
+
+    /**
+     * Start the watchdog if the target is a Broker and the configured
+     * interval is positive. An interval of 0 disables the watchdog.
+     */
+    void earlyInitialize(qpid::Plugin::Target& target) {
+        broker = dynamic_cast<Broker*>(&target);
+        if (!broker) return;
+        if (settings.interval > 0) {
+            QPID_LOG(notice, "Starting watchdog process with interval of " <<
+                     settings.interval << " seconds");
+            fork();
+        }
+        else if (settings.interval < 0) {
+            // A negative interval cannot be used to arm the watchdog timer;
+            // the helper would fail to set it and kill the broker at once.
+            QPID_LOG(warning, "Ignoring negative watchdog-interval " <<
+                     settings.interval << ", watchdog is disabled");
+        }
+    }
+
+    void initialize(Target&) {}
+
+  protected:
+
+    /**
+     * Child of fork: replace this process with the watchdog program.
+     * If the exec fails the parent broker is killed, since it would
+     * otherwise run without the watchdog that was requested.
+     */
+    void child() {
+        const char* watchdog = ::getenv("QPID_WATCHDOG_EXE"); // For use in tests
+        if (!watchdog) watchdog=QPID_EXEC_DIR "/qpidd_watchdog";
+        std::string interval = boost::lexical_cast<std::string>(settings.interval);
+        // The terminating argument of execl must be a null char pointer;
+        // a bare NULL may be passed as an int through the varargs.
+        ::execl(watchdog, watchdog, interval.c_str(), static_cast<char*>(0));
+        QPID_LOG(critical, "Failed to exec watchdog program " << watchdog );
+        ::kill(::getppid(), SIGKILL);
+        // _exit, not exit: do not run the broker's atexit handlers and
+        // static destructors in the forked child.
+        ::_exit(1);
+    }
+
+    /** Parent of fork: remember the watchdog pid and start the keepalives. */
+    void parent(int pid) {
+        watchdogPid = pid;
+        broker->getTimer().add(
+            new WatchDogTask(watchdogPid, broker->getTimer(), settings.interval));
+        // TODO aconway 2009-08-10: to be extra safe, we could monitor
+        // the watchdog child and re-start it if it exits.
+    }
+};
+
+static WatchDogPlugin instance; // Static initialization.
+
+}} // namespace qpid::cluster
diff --git a/cpp/src/qpid/cluster/qpidd_watchdog.cpp b/cpp/src/qpid/cluster/qpidd_watchdog.cpp
new file mode 100644
index 0000000000..0e7f4f18fd
--- /dev/null
+++ b/cpp/src/qpid/cluster/qpidd_watchdog.cpp
@@ -0,0 +1,60 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+
+long timeout; // Watchdog timeout in seconds: set from argv[1] in main(), used by resetTimer() to arm the timer.
+
+/**
+ * Signal handler (also called directly): kill the parent process with
+ * SIGKILL, report the outcome on stderr and exit with status 1.
+ * The signal number argument is ignored. Does not return.
+ */
+void killParent(int) {
+    // Read the parent pid once, before the kill: once the parent is dead
+    // this process is re-parented and getppid() returns a different pid.
+    const pid_t parent = ::getppid();
+    if (::kill(parent, SIGKILL) == 0)
+        ::fprintf(stderr, "Watchdog killed unresponsive broker, pid=%d\n", static_cast<int>(parent));
+    else
+        ::perror("Watchdog failed to kill unresponsive broker");
+    ::exit(1);
+}
+
+/**
+ * Signal handler (also called directly): (re)arm a one-shot real-time
+ * timer that expires after `timeout` seconds. If the timer cannot be
+ * set, report the error and kill the parent. The argument is ignored.
+ */
+void resetTimer(int) {
+    struct ::itimerval oneShot;
+    oneShot.it_interval.tv_sec = 0;     // No periodic reload.
+    oneShot.it_interval.tv_usec = 0;
+    oneShot.it_value.tv_sec = timeout;  // Time until expiry.
+    oneShot.it_value.tv_usec = 0;
+    if (::setitimer(ITIMER_REAL, &oneShot, 0) == 0)
+        return;
+    ::perror("Watchdog failed to set timer");
+    killParent(0);
+    ::exit(1);
+}
+
+/** Simple watchdog program: kill parent process if timeout
+ * expires without a SIGUSR1.
+ * Terminated by the parent with SIGTERM when the parent shuts down.
+ * Args: timeout in seconds, a positive decimal integer.
+ */
+int main(int argc, char** argv) {
+    // Parse strictly. A negative or malformed timeout must be rejected
+    // here: a negative value makes setitimer() fail in resetTimer(),
+    // which then kills the parent broker immediately.
+    char* end = 0;
+    const long seconds = (argc == 2) ? ::strtol(argv[1], &end, 10) : 0;
+    if (argc != 2 || end == argv[1] || *end != '\0' || seconds <= 0 || seconds > INT_MAX) {
+        ::fprintf(stderr, "Usage: %s <timeout_seconds>\n", argv[0]);
+        ::exit(1);
+    }
+    timeout = seconds;
+    ::signal(SIGUSR1, resetTimer);
+    ::signal(SIGALRM, killParent);
+    resetTimer(0);
+    // Wait for signals with pause() rather than sleep(): the interaction
+    // between sleep() and setitimer()/SIGALRM is unspecified.
+    while (true) { ::pause(); }
+}
diff --git a/cpp/src/tests/cluster.mk b/cpp/src/tests/cluster.mk
index dc592fa4d5..6fc4c64a5e 100644
--- a/cpp/src/tests/cluster.mk
+++ b/cpp/src/tests/cluster.mk
@@ -29,44 +29,46 @@ if HAVE_LIBCPG
# ais_check checks pre-requisites for cluster tests and runs them if ok.
-TESTS += \
- ais_check \
- run_cluster_tests \
- federated_cluster_test \
+TESTS += \
+ ais_check \
+ test_watchdog \
+ run_cluster_tests \
+ federated_cluster_test \
clustered_replication_test
-
-EXTRA_DIST += \
- ais_check \
- start_cluster \
- stop_cluster \
- restart_cluster \
- cluster_python_tests \
- cluster_python_tests_failing.txt \
- federated_cluster_test \
- clustered_replication_test \
- run_cluster_tests \
- run_long_cluster_tests \
- testlib.py \
- cluster_tests.py \
- long_cluster_tests.py
-
-LONG_TESTS += \
- run_long_cluster_tests \
- start_cluster \
- cluster_python_tests \
+# Scripts and data shipped in the source distribution. Test scripts listed
+# in TESTS (such as test_watchdog) must also be listed here, otherwise they
+# are missing from the tarball built by "make dist".
+EXTRA_DIST += \
+ ais_check \
+ test_watchdog \
+ start_cluster \
+ stop_cluster \
+ restart_cluster \
+ cluster_python_tests \
+ cluster_python_tests_failing.txt \
+ federated_cluster_test \
+ clustered_replication_test \
+ run_cluster_tests \
+ run_long_cluster_tests \
+ testlib.py \
+ cluster_tests.py \
+ long_cluster_tests.py
+
+LONG_TESTS += \
+ run_long_cluster_tests \
+ start_cluster \
+ cluster_python_tests \
stop_cluster
qpidtest_PROGRAMS += cluster_test
-cluster_test_SOURCES = \
- cluster_test.cpp \
- unit_test.cpp \
- ClusterFixture.cpp \
- ClusterFixture.h \
- ForkedBroker.h \
- ForkedBroker.cpp \
- PartialFailure.cpp \
- ClusterFailover.cpp
+
+cluster_test_SOURCES = \
+ cluster_test.cpp \
+ unit_test.cpp \
+ ClusterFixture.cpp \
+ ClusterFixture.h \
+ ForkedBroker.h \
+ ForkedBroker.cpp \
+ PartialFailure.cpp \
+ ClusterFailover.cpp
+
cluster_test_LDADD=$(lib_client) $(lib_broker) -lboost_unit_test_framework
qpidtest_SCRIPTS += run_cluster_tests cluster_tests.py run_long_cluster_tests long_cluster_tests.py testlib.py
diff --git a/cpp/src/tests/test_watchdog b/cpp/src/tests/test_watchdog
new file mode 100755
index 0000000000..c2f33501b8
--- /dev/null
+++ b/cpp/src/tests/test_watchdog
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Tests for the watchdog plug-in
+
+# Start a broker with watchdog, freeze it with kill -STOP, verify that it is killed.
+# Exits non-zero if the broker cannot be started or survives being frozen.
+export QPID_WATCHDOG_EXE=$PWD/../qpidd_watchdog
+PORT=`../qpidd -dp0 --no-data-dir --auth=no --no-module-dir --load-module $PWD/../.libs/watchdog.so --log-to-file=qpidd_watchdog.log --watchdog-interval 1`
+test -n "$PORT" || { echo "Failed to start broker."; exit 1; }
+PID=`../qpidd -cp $PORT`
+test -n "$PID" || { echo "Failed to get broker pid."; exit 1; }
+kill -STOP $PID
+sleep 2
+
+if kill -0 $PID 2>/dev/null; then
+    echo "Hung process did not die."
+    # The broker is stopped and cannot act on SIGTERM, so use SIGKILL.
+    kill -KILL $PID
+    # Fail the test: without this the script exits with kill's status (0).
+    exit 1
+fi
+exit 0