summaryrefslogtreecommitdiff
path: root/cpp/src/qpid/cluster/WatchDogPlugin.cpp
blob: 57ba5cf2fdfaffd4a47114d990b0e995439560fd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/*
 *
 * Copyright (c) 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/**@file

   The watchdog plug-in will kill the qpidd broker process if it
   becomes stuck for longer than a configured interval.

   If the watchdog plugin is loaded and the --watchdog-interval=N
   option is set then the broker starts a watchdog process and signals
   it every N/2 seconds.

   The watchdog process runs a very simple program that starts a timer
   for N seconds, and resets the timer to N seconds whenever it is
   signalled by the broker. If the timer ever reaches 0 the watchdog
   kills the broker process (with kill -9) and exits.

   This is useful in a cluster setting because in some instances
   (e.g. while resolving an error) it's possible for a stuck process
   to hang other cluster members that are waiting for it to send a
   message.  Using the watchdog, the stuck process is terminated and
   removed from the cluster allowing other members to continue and
   clients of the stuck process to fail over to other members.

*/
#include "config.h"
#include "qpid/Plugin.h"
#include "qpid/Options.h"
#include "qpid/log/Statement.h"
#include "qpid/broker/Broker.h"
#include "qpid/sys/Timer.h"
#include "qpid/sys/Fork.h"
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>

namespace qpid {
namespace cluster {

using broker::Broker;

/** Configuration values for the watchdog plug-in. */
struct Settings {
    /** Watchdog interval in seconds; the default of 0 disables the watchdog. */
    int interval;

    Settings() : interval(0) {}
};

/**
 * Command line / configuration options for the watchdog plug-in.
 *
 * Registers the "watchdog-interval" option and binds it to the
 * interval field of the Settings object passed to the constructor.
 */
struct WatchDogOptions : public qpid::Options {
    Settings& settings;         // Receives the parsed option values.

    /** @param s Settings object that the parsed values are written to. */
    WatchDogOptions(Settings& s) : settings(s) {
        // The help text is built from adjacent string literals rather than
        // a backslash line continuation inside a single literal: a
        // continuation copies the next source line's leading indentation
        // (tab and spaces) into the text that is shown to the user.
        addOptions()
            ("watchdog-interval", optValue(settings.interval, "N"),
             "broker is automatically killed if it is hung for more than "
             "N seconds. 0 disables watchdog.");
    }
};

struct WatchDogTask : public sys::TimerTask {
    int pid;
    sys::Timer& timer;
    int interval;

    WatchDogTask(int pid_, sys::Timer& t, int _interval)
        : TimerTask(_interval*sys::TIME_SEC/2,"WatchDog"), pid(pid_), timer(t), interval(_interval) {}

    void fire() {
        timer.add (new WatchDogTask(pid, timer, interval));
        QPID_LOG(debug, "Sending keepalive signal to watchdog");
        ::kill(pid, SIGUSR1);
    }
};

/**
 * Broker plug-in that starts the external watchdog process.
 *
 * When the plug-in target is a Broker and a non-zero watchdog interval
 * is configured, earlyInitialize() forks. The child execs the watchdog
 * program; the parent (the broker) adds a WatchDogTask to the broker's
 * timer so the watchdog is signalled periodically.
 */
struct WatchDogPlugin : public qpid::Plugin, public qpid::sys::Fork {
    Settings settings;          // Parsed option values.
    WatchDogOptions options;    // Option definitions bound to 'settings'.
    Broker* broker;             // Target broker, 0 until earlyInitialize().
    int watchdogPid;            // Pid of the watchdog child, 0 if not started.

    WatchDogPlugin() : options(settings), broker(0), watchdogPid(0) {}

    /**
     * Terminate and reap the watchdog process, if one was started.
     *
     * Both calls are guarded by watchdogPid: waitpid() with a pid of 0
     * waits for any child in the caller's process group, so calling it
     * when no watchdog was started could block on, or reap, an unrelated
     * child process.
     */
    ~WatchDogPlugin() {
        if (watchdogPid) {
            ::kill(watchdogPid, SIGTERM);
            ::waitpid(watchdogPid, 0, 0);
        }
    }

    /** @return the option set holding the watchdog-interval option. */
    Options* getOptions() { return &options; }

    /**
     * Start the watchdog if the target is a Broker and the configured
     * interval is non-zero. Other targets are ignored.
     */
    void earlyInitialize(qpid::Plugin::Target& target) {
        broker = dynamic_cast<Broker*>(&target);
        if (broker && settings.interval) {
            QPID_LOG(notice, "Starting watchdog process with interval of " <<
                     settings.interval << " seconds");
            fork();             // Continues in child() and parent() below.
        }
    }

    /** Nothing to do here; all work happens in earlyInitialize(). */
    void initialize(Target&) {}

  protected:

    /**
     * Child side of the fork: replace this process with the watchdog
     * program, passing the interval as its only argument.
     *
     * The program path is taken from the QPID_WATCHDOG_EXEC environment
     * variable if set, otherwise QPID_LIBEXEC_DIR "/qpidd_watchdog".
     */
    void child() {              // Child of fork
        const char* watchdog = ::getenv("QPID_WATCHDOG_EXEC"); // For use in tests
        if (!watchdog) watchdog=QPID_LIBEXEC_DIR "/qpidd_watchdog";
        std::string interval = boost::lexical_cast<std::string>(settings.interval);
        ::execl(watchdog, watchdog, interval.c_str(), NULL);
        // execl() only returns on failure. The parent (the broker) is then
        // killed as well -- presumably so that a broker configured with a
        // watchdog never runs without one.
        QPID_LOG(critical, "Failed to exec watchdog program " << watchdog );
        ::kill(::getppid(), SIGKILL);
        exit(1);
    }

    /**
     * Parent side of the fork: remember the watchdog's pid and schedule
     * the first keepalive task on the broker's timer.
     *
     * @param pid process id of the forked watchdog child.
     */
    void parent(int pid) {          // Parent of fork
        watchdogPid = pid;
        broker->getTimer().add(
            new WatchDogTask(watchdogPid, broker->getTimer(), settings.interval));
        // TODO aconway 2009-08-10: to be extra safe, we could monitor
        // the watchdog child and re-start it if it exits.
    }
};

// The single plug-in object for this file, constructed during static
// initialization. NOTE(review): presumably the qpid::Plugin base class
// constructor registers it with the plug-in framework -- confirm in
// qpid/Plugin.h.
static WatchDogPlugin instance; // Static initialization.

}} // namespace qpid::cluster