summaryrefslogtreecommitdiff
path: root/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp')
-rw-r--r--qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp137
1 files changed, 137 insertions, 0 deletions
diff --git a/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp b/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp
new file mode 100644
index 0000000000..57ba5cf2fd
--- /dev/null
+++ b/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp
@@ -0,0 +1,137 @@
+/*
+ *
+ * Copyright (c) 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+/**@file
+
+ The watchdog plug-in will kill the qpidd broker process if it
+ becomes stuck for longer than a configured interval.
+
+ If the watchdog plugin is loaded and the --watchdog-interval=N
+ option is set then the broker starts a watchdog process and signals
+ it every N/2 seconds.
+
+ The watchdog process runs a very simple program that starts a timer
+ for N seconds, and resets the timer to N seconds whenever it is
+ signalled by the broker. If the timer ever reaches 0 the watchdog
+ kills the broker process (with kill -9) and exits.
+
+ This is useful in a cluster setting because in some insttances
+ (e.g. while resolving an error) it's possible for a stuck process
+ to hang other cluster members that are waiting for it to send a
+ message. Using the watchdog, the stuck process is terminated and
+ removed fromt the cluster allowing other members to continue and
+ clients of the stuck process to fail over to other members.
+
+*/
+#include "config.h"
+#include "qpid/Plugin.h"
+#include "qpid/Options.h"
+#include "qpid/log/Statement.h"
+#include "qpid/broker/Broker.h"
+#include "qpid/sys/Timer.h"
+#include "qpid/sys/Fork.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+namespace qpid {
+namespace cluster {
+
+using broker::Broker;
+
+struct Settings {
+ Settings() : interval(0) {}
+ int interval;
+};
+
+struct WatchDogOptions : public qpid::Options {
+ Settings& settings;
+
+ WatchDogOptions(Settings& s) : settings(s) {
+ addOptions()
+ ("watchdog-interval", optValue(settings.interval, "N"),
+ "broker is automatically killed if it is hung for more than \
+ N seconds. 0 disables watchdog.");
+ }
+};
+
+struct WatchDogTask : public sys::TimerTask {
+ int pid;
+ sys::Timer& timer;
+ int interval;
+
+ WatchDogTask(int pid_, sys::Timer& t, int _interval)
+ : TimerTask(_interval*sys::TIME_SEC/2,"WatchDog"), pid(pid_), timer(t), interval(_interval) {}
+
+ void fire() {
+ timer.add (new WatchDogTask(pid, timer, interval));
+ QPID_LOG(debug, "Sending keepalive signal to watchdog");
+ ::kill(pid, SIGUSR1);
+ }
+};
+
+struct WatchDogPlugin : public qpid::Plugin, public qpid::sys::Fork {
+ Settings settings;
+ WatchDogOptions options;
+ Broker* broker;
+ int watchdogPid;
+
+ WatchDogPlugin() : options(settings), broker(0), watchdogPid(0) {}
+
+ ~WatchDogPlugin() {
+ if (watchdogPid) ::kill(watchdogPid, SIGTERM);
+ ::waitpid(watchdogPid, 0, 0);
+ }
+
+ Options* getOptions() { return &options; }
+
+ void earlyInitialize(qpid::Plugin::Target& target) {
+ broker = dynamic_cast<Broker*>(&target);
+ if (broker && settings.interval) {
+ QPID_LOG(notice, "Starting watchdog process with interval of " <<
+ settings.interval << " seconds");
+ fork();
+ }
+ }
+
+ void initialize(Target&) {}
+
+ protected:
+
+ void child() { // Child of fork
+ const char* watchdog = ::getenv("QPID_WATCHDOG_EXEC"); // For use in tests
+ if (!watchdog) watchdog=QPID_LIBEXEC_DIR "/qpidd_watchdog";
+ std::string interval = boost::lexical_cast<std::string>(settings.interval);
+ ::execl(watchdog, watchdog, interval.c_str(), NULL);
+ QPID_LOG(critical, "Failed to exec watchdog program " << watchdog );
+ ::kill(::getppid(), SIGKILL);
+ exit(1);
+ }
+
+ void parent(int pid) { // Parent of fork
+ watchdogPid = pid;
+ broker->getTimer().add(
+ new WatchDogTask(watchdogPid, broker->getTimer(), settings.interval));
+ // TODO aconway 2009-08-10: to be extra safe, we could monitor
+ // the watchdog child and re-start it if it exits.
+ }
+};
+
+static WatchDogPlugin instance; // Static initialization.
+
+}} // namespace qpid::cluster