summaryrefslogtreecommitdiff
path: root/db/repl/rs.h
blob: 50c2c4c9b9c80e8a1c74325d8c813b1256922516 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
// /db/repl/rs.h

/**
*    Copyright (C) 2008 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../../util/concurrency/list.h"
#include "../../util/concurrency/value.h"
#include "../../util/concurrency/msg.h"
#include "../../util/hostandport.h"
#include "../commands.h"
#include "rs_exception.h"
#include "rs_optime.h"
#include "rs_member.h"
#include "rs_config.h"

namespace mongo {

    struct HowToFixUp;
    struct Target;
    class DBClientConnection;
    class ReplSetImpl;
    class OplogReader;
    extern bool replSet; // true if using repl sets
    extern class ReplSet *theReplSet; // null until initialized
    extern Tee *rsLog;

    /* member of a replica set */
    class Member : public List1<Member>::Base {
    public:
        Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self);

        string fullName() const { return h().toString(); }
        const ReplSetConfig::MemberCfg& config() const { return _config; }
        const HeartbeatInfo& hbinfo() const { return _hbinfo; }
        HeartbeatInfo& get_hbinfo() { return _hbinfo; }
        string lhb() const { return _hbinfo.lastHeartbeatMsg; }
        MemberState state() const { return _hbinfo.hbstate; }
        const HostAndPort& h() const { return _h; }
        unsigned id() const { return _hbinfo.id(); }

        bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0
        void summarizeMember(stringstream& s) const;

    private:
        friend class ReplSetImpl;
        const ReplSetConfig::MemberCfg _config;
        const HostAndPort _h;
        HeartbeatInfo _hbinfo;
    };

    class Manager : public task::Server {
        ReplSetImpl *rs;
        bool busyWithElectSelf;
        int _primary;

        /** @param two - if true two primaries were seen.  this can happen transiently, in addition to our
                         polling being only occasional.  in this case null is returned, but the caller should
                         not assume primary itself in that situation.
        */
        const Member* findOtherPrimary(bool& two);

        void noteARemoteIsPrimary(const Member *);
        virtual void starting();
    public:
        Manager(ReplSetImpl *rs);
        virtual ~Manager();
        void msgReceivedNewConfig(BSONObj);
        void msgCheckNewState();
    };

    struct Target;

    class Consensus {
        ReplSetImpl &rs;
        struct LastYea {
            LastYea() : when(0), who(0xffffffff) { }
            time_t when;
            unsigned who;
        };
        Atomic<LastYea> ly;
        unsigned yea(unsigned memberId); // throws VoteException
        void electionFailed(unsigned meid);
        void _electSelf();
        bool weAreFreshest(bool& allUp, int& nTies);
        bool sleptLast; // slept last elect() pass
    public:
        Consensus(ReplSetImpl *t) : rs(*t) {
            sleptLast = false;
            steppedDown = 0;
        }

        /* if we've stepped down, this is when we are allowed to try to elect ourself again.
           todo: handle possible weirdnesses at clock skews etc.
        */
        time_t steppedDown;

        int totalVotes() const;
        bool aMajoritySeemsToBeUp() const;
        bool shouldRelinquish() const;
        void electSelf();
        void electCmdReceived(BSONObj, BSONObjBuilder*);
        void multiCommand(BSONObj cmd, list<Target>& L);
    };

    /** most operations on a ReplSet object should be done while locked. that logic implemented here. */
    class RSBase : boost::noncopyable {
    public:
        const unsigned magic;
        void assertValid() { assert( magic == 0x12345677 ); }
    private:
        mongo::mutex m;
        int _locked;
        ThreadLocalValue<bool> _lockedByMe;
    protected:
        RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
        ~RSBase() {
            /* this can happen if we throw in the constructor; otherwise never happens.  thus we log it as it is quite unusual. */
            log() << "replSet ~RSBase called" << rsLog;
        }

        class lock {
            RSBase& rsbase;
            auto_ptr<scoped_lock> sl;
        public:
            lock(RSBase* b) : rsbase(*b) {
                if( rsbase._lockedByMe.get() )
                    return; // recursive is ok...

                sl.reset( new scoped_lock(rsbase.m) );
                DEV assert(rsbase._locked == 0);
                rsbase._locked++;
                rsbase._lockedByMe.set(true);
            }
            ~lock() {
                if( sl.get() ) {
                    assert( rsbase._lockedByMe.get() );
                    DEV assert(rsbase._locked == 1);
                    rsbase._lockedByMe.set(false);
                    rsbase._locked--;
                }
            }
        };

    public:
        /* for asserts */
        bool locked() const { return _locked != 0; }

        /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another
           just for asserts & such so we can make the contracts clear on who locks what when.
           we don't use these locks that frequently, so the little bit of overhead is fine.
        */
        bool lockedByMe() { return _lockedByMe.get(); }
    };

    class ReplSetHealthPollTask;

    /* safe container for our state that keeps member pointer and state variables always aligned */
    class StateBox : boost::noncopyable {
    public:
        struct SP { // SP is like pair<MemberState,const Member *> but nicer
            SP() : state(MemberState::RS_STARTUP), primary(0) { }
            MemberState state;
            const Member *primary;
        };
        const SP get() {
            scoped_lock lk(m);
            return sp;
        }
        MemberState getState() const { return sp.state; }
        const Member* getPrimary() const { return sp.primary; }
        void change(MemberState s, const Member *self) {
            scoped_lock lk(m);
            if( sp.state != s ) {
                log() << "replSet " << s.toString() << rsLog;
            }
            sp.state = s;
            if( s.primary() ) {
                sp.primary = self;
            }
            else {
                if( self == sp.primary )
                    sp.primary = 0;
            }
        }
        void set(MemberState s, const Member *p) {
            scoped_lock lk(m);
            sp.state = s; sp.primary = p;
        }
        void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
        void setOtherPrimary(const Member *mem) {
            scoped_lock lk(m);
            assert( !sp.state.primary() );
            sp.primary = mem;
        }
        void noteRemoteIsPrimary(const Member *remote) {
            scoped_lock lk(m);
            if( !sp.state.secondary() && !sp.state.fatal() )
                sp.state = MemberState::RS_RECOVERING;
            sp.primary = remote;
        }
        StateBox() : m("StateBox") { }
    private:
        mongo::mutex m;
        SP sp;
    };

    void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet );

    /** Parameter given to the --replSet command line option (parsed).
        Syntax is "<setname>/<seedhost1>,<seedhost2>"
        where setname is a name and seedhost is "<host>[:<port>]" */
    class ReplSetCmdline {
    public:
        ReplSetCmdline(string cfgString) { parseReplsetCmdLine(cfgString, setname, seeds, seedSet); }
        string setname;
        vector<HostAndPort> seeds;
        set<HostAndPort> seedSet;
    };

    /* information about the entire repl set, such as the various servers in the set, and their state */
    /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
             singleton and long lived.
    */
    class ReplSetImpl : protected RSBase {
    public:
        /** info on our state if the replset isn't yet "up".  for example, if we are pre-initiation. */
        enum StartupStatus {
            PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
            EMPTYUNREACHABLE=4, STARTED=5, SOON=6
        };
        static StartupStatus startupStatus;
        static DiagStr startupStatusMsg;
        static string stateAsHtml(MemberState state);

        /* todo thread */
        void msgUpdateHBInfo(HeartbeatInfo);

        StateBox box;

        OpTime lastOpTimeWritten;
        long long lastH; // hash we use to make sure we are reading the right flow of ops and aren't on an out-of-date "fork"
    private:
        set<ReplSetHealthPollTask*> healthTasks;
        void endOldHealthTasks();
        void startHealthTaskFor(Member *m);

        Consensus elect;
        void relinquish();
        void forgetPrimary();
    protected:
        bool _stepDown(int secs);
        bool _freeze(int secs);
    private:
        void assumePrimary();
        void loadLastOpTimeWritten();
        void changeState(MemberState s);
        const Member* getMemberToSyncTo();
        void _changeArbiterState();
    protected:
        // "heartbeat message"
        // sent in requestHeartbeat respond in field "hbm"
        char _hbmsg[256]; // we change this unlocked, thus not an stl::string
        time_t _hbmsgTime; // when it was logged
    public:
        void sethbmsg(string s, int logLevel = 0);
    protected:
        bool initFromConfig(ReplSetConfig& c, bool reconf=false); // true if ok; throws if config really bad; false if config doesn't include self
        void _fillIsMaster(BSONObjBuilder&);
        void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
        const ReplSetConfig& config() { return *_cfg; }
        string name() const { return _name; } /* @return replica set's logical name */
        MemberState state() const { return box.getState(); }
        void _fatal();
        void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
        void _summarizeAsHtml(stringstream&) const;
        void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command

        /* throws exception if a problem initializing. */
        ReplSetImpl(ReplSetCmdline&);

        /* call afer constructing to start - returns fairly quickly after launching its threads */
        void _go();

    private:
        string _name;
        const vector<HostAndPort> *_seeds;
        ReplSetConfig *_cfg;

        /** load our configuration from admin.replset.  try seed machines too.
            @return true if ok; throws if config really bad; false if config doesn't include self
        */
        bool _loadConfigFinish(vector<ReplSetConfig>& v);
        void loadConfig();

        list<HostAndPort> memberHostnames() const;
        const ReplSetConfig::MemberCfg& myConfig() const { return _self->config(); }
        bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
        bool iAmPotentiallyHot() const { return myConfig().potentiallyHot(); }
    protected:
        Member *_self;
        bool _buildIndexes;       // = _self->config().buildIndexes
        void setSelfTo(Member *); // use this as it sets buildIndexes var
    private:
        List1<Member> _members; /* all members of the set EXCEPT self. */

    public:
        unsigned selfId() const { return _self->id(); }
        Manager *mgr;

    private:
        Member* head() const { return _members.head(); }
    public:
        const Member* findById(unsigned id) const;
    private:
        void _getTargets(list<Target>&, int &configVersion);
        void getTargets(list<Target>&, int &configVersion);
        void startThreads();
        friend class FeedbackThread;
        friend class CmdReplSetElect;
        friend class Member;
        friend class Manager;
        friend class Consensus;

    private:
        /* pulling data from primary related - see rs_sync.cpp */
        bool initialSyncOplogApplication(const Member *primary, OpTime applyGTE, OpTime minValid);
        void _syncDoInitialSync();
        void syncDoInitialSync();
        void _syncThread();
        bool tryToGoLiveAsASecondary(OpTime&); // readlocks
        void syncTail();
        bool syncApply(const BSONObj &o);
        unsigned _syncRollback(OplogReader& r);
        void syncRollback(OplogReader& r);
        void syncFixUp(HowToFixUp& h, OplogReader& r);
        bool _getOplogReader(OplogReader& r, string& hn);
        bool _isStale(OplogReader& r, const string& hn);
    public:
        void syncThread();
    };

    class ReplSet : public ReplSetImpl {
    public:
        ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) {  }

        // for the replSetStepDown command
        bool stepDown(int secs) { return _stepDown(secs); }

        // for the replSetFreeze command
        bool freeze(int secs) { return _freeze(secs); }

        string selfFullName() {
            lock lk(this);
            return _self->fullName();
        }

        bool buildIndexes() const { return _buildIndexes; }

        /* call after constructing to start - returns fairly quickly after la[unching its threads */
        void go() { _go(); }

        void fatal() { _fatal(); }
        bool isPrimary() { return box.getState().primary(); }
        bool isSecondary() {  return box.getState().secondary(); }
        MemberState state() const { return ReplSetImpl::state(); }
        string name() const { return ReplSetImpl::name(); }
        const ReplSetConfig& config() { return ReplSetImpl::config(); }
        void getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { _getOplogDiagsAsHtml(server_id,ss); }
        void summarizeAsHtml(stringstream& ss) const { _summarizeAsHtml(ss); }
        void summarizeStatus(BSONObjBuilder& b) const  { _summarizeStatus(b); }
        void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }

        /* we have a new config (reconfig) - apply it.
           @param comment write a no-op comment to the oplog about it.  only makes sense if one is primary and initiating the reconf.
        */
        void haveNewConfig(ReplSetConfig& c, bool comment);

        /* if we delete old configs, this needs to assure locking. currently we don't so it is ok. */
        const ReplSetConfig& getConfig() { return config(); }

        bool lockedByMe() { return RSBase::lockedByMe(); }

        // heartbeat msg to send to others; descriptive diagnostic info
        string hbmsg() const {
            if( time(0)-_hbmsgTime > 120 ) return "";
            return _hbmsg;
        }
    };

    /** base class for repl set commands.  checks basic things such as in rs mode before the command
        does its real work
        */
    class ReplSetCommand : public Command {
    protected:
        ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
        virtual bool slaveOk() const { return true; }
        virtual bool adminOnly() const { return true; }
        virtual bool logTheOp() { return false; }
        virtual LockType locktype() const { return NONE; }
        virtual void help( stringstream &help ) const { help << "internal"; }
        bool check(string& errmsg, BSONObjBuilder& result) {
            if( !replSet ) {
                errmsg = "not running with --replSet";
                return false;
            }
            if( theReplSet == 0 ) {
                result.append("startupStatus", ReplSet::startupStatus);
                errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get();
                if( ReplSet::startupStatus == 3 )
                    result.append("info", "run rs.initiate(...) if not yet done for the set");
                return false;
            }
            return true;
        }
    };

    /** inlines ----------------- */

    inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) :
        _config(*c), _h(h), _hbinfo(ord) {
        if( self )
            _hbinfo.health = 1.0;
    }

}