summaryrefslogtreecommitdiff
path: root/jstests/replsets/catchup.js
blob: 3d661bb8e35821420f2e1a41312b39620f3eb22f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Test the catch-up behavior of new primaries.

load("jstests/replsets/rslib.js");

(function() {

    "use strict";
    // Replica set name passed to ReplSetTest below.
    var name = "catch_up";
    // Three-node replica set. useBridge is enabled; presumably this is what allows the
    // node-to-node disconnect() call used in Case 4 -- TODO confirm against ReplSetTest.
    var rst = new ReplSetTest({name: name, nodes: 3, useBridge: true});

    rst.startSet();
    // Start from the generated config and override the timing settings before initiating.
    var conf = rst.getReplSetConfig();
    conf.settings = {
        heartbeatIntervalMillis: 500,
        electionTimeoutMillis: 10000,
        // Presumably the window a new primary gets to catch up; Case 3 waits for the
        // "Cannot catch up oplog" log line -- TODO confirm the exact semantics.
        catchUpTimeoutMillis: 10000
    };
    rst.initiate(conf);
    rst.awaitSecondaryNodes();

    // NOTE(review): 'primary' is only used to build 'primaryColl', and 'primaryColl' is not
    // referenced anywhere else in the visible code.
    var primary = rst.getPrimary();
    var primaryColl = primary.getDB("test").coll;

    // Set verbosity for replication on all nodes.
    var verbosity = {
        "setParameter": 1,
        "logComponentVerbosity": {
            "replication": {"verbosity": 2},
        }
    };
    // The command result is not checked here.
    rst.nodes.forEach(function(node) {
        node.adminCommand(verbosity);
    });

    /**
     * Turns on the 'pauseRsBgSyncProducer' and then the 'stopOplogFetcher' failpoint on
     * 'node', asserting that each configureFailPoint command succeeds.
     *
     * @param node connection exposing 'host' and 'adminCommand()'.
     */
    function enableFailPoint(node) {
        jsTest.log("enable failpoint " + node.host);

        // First failpoint: checked with an explicit failure message.
        var pauseProducerCmd = {configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'};
        var pauseProducerRes = node.adminCommand(pauseProducerCmd);
        assert.commandWorked(pauseProducerRes,
                             'Failed to configure pauseRsBgSyncProducer failpoint.');

        // Second failpoint: only sent once the first one has been confirmed.
        var stopFetcherCmd = {configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'};
        var stopFetcherRes = node.adminCommand(stopFetcherCmd);
        assert.commandWorked(stopFetcherRes);
    }

    /**
     * Turns off the 'stopOplogFetcher' and then the 'pauseRsBgSyncProducer' failpoint on
     * 'node'.
     *
     * Turning off 'stopOplogFetcher' must succeed. Turning off 'pauseRsBgSyncProducer' is
     * best-effort: any exception from it is printed and swallowed.
     *
     * @param node connection exposing 'host' and 'adminCommand()'.
     */
    function disableFailPoint(node) {
        jsTest.log("disable failpoint " + node.host);
        assert.commandWorked(
            node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}));
        try {
            assert.commandWorked(
                node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'}),
                'Failed to disable pauseRsBgSyncProducer failpoint.');
        } catch (ex) {
            // Enable bgsync producer may cause rollback, which will close all connections
            // including the one sending "configureFailPoint".
            // The message must use the caught binding 'ex'; referencing an undeclared name
            // here would throw a ReferenceError and defeat the best-effort handling.
            print("got exception when disabling fail point 'pauseRsBgSyncProducer': " + ex);
        }
    }

    /**
     * Sends replSetStepUp to 'node', asserts the command worked, and hands the same node
     * back so the call can be used inline in an assignment.
     */
    function stepUp(node) {
        var stepUpResult = node.adminCommand({replSetStepUp: 1});
        assert.commandWorked(stepUpResult);
        return node;
    }

    /**
     * Inserts the documents {x: 0}, {x: 1} and {x: 2}, in that order, into test.foo through
     * 'node', asserting that every insert succeeds before issuing the next one.
     */
    function doWrites(node) {
        [0, 1, 2].forEach(function(value) {
            var writeResult = node.getDB("test").foo.insert({x: value});
            assert.writeOK(writeResult);
        });
    }

    /**
     * Asserts that exactly 'count' entries in the local.oplog.rs collection of 'node' match
     * the query document 'op'. The full oplog contents are included in the failure message.
     *
     * @param node connection to inspect; setSlaveOk() is called on it first.
     * @param op query document passed to count().
     * @param count expected number of matching entries.
     */
    function checkOpInOplog(node, op, count) {
        var conn = node.getDB("admin").getMongo();
        conn.setSlaveOk();

        var oplogColl = node.getDB("local")['oplog.rs'];
        // Read the whole oplog up front so it can be reported if the assertion fails.
        var allEntries = oplogColl.find().toArray();
        var matched = oplogColl.count(op);
        assert.eq(matched, count, "oplog: " + tojson(allEntries));
    }

    /**
     * Returns true when 'ts1' orders strictly before 'ts2'. The getTime() values are
     * compared first; the getInc() values are only consulted when the times are equal.
     */
    function isEarlierTimestamp(ts1, ts2) {
        var sameTime = ts1.getTime() == ts2.getTime();
        return sameTime ? ts1.getInc() < ts2.getInc() : ts1.getTime() < ts2.getTime();
    }

    /**
     * Fetches the 'global' log from 'node' with getLog, asserts the command worked, and
     * returns how many log lines 'regex' matches.
     *
     * @param node connection exposing 'adminCommand()'.
     * @param regex pattern applied to each line with exec().
     * @return number of matching lines.
     */
    function countMatchedLog(node, regex) {
        var logResponse = node.adminCommand({getLog: 'global'});
        assert.commandWorked(logResponse);
        return logResponse.log.reduce(function(matches, entry) {
            return regex.exec(entry) ? matches + 1 : matches;
        }, 0);
    }

    jsTest.log("Case 1: The primary is up-to-date after freshness scan.");
    // Should complete transition to primary immediately.
    // Wait for replication first so the node stepped up below has nothing to catch up on.
    rst.awaitReplication(30000, ReplSetTest.OpTimeType.LAST_DURABLE);
    var newPrimary = stepUp(rst.getSecondary());
    // The node is given 1000 (the third argument) to reach the PRIMARY state.
    rst.waitForState(newPrimary, ReplSetTest.State.PRIMARY, 1000);
    // Should win an election and finish the transition very quickly.
    assert.eq(newPrimary, rst.getPrimary());

    jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
    rst.awaitReplication(30000, ReplSetTest.OpTimeType.LAST_DURABLE);
    // Write documents that cannot be replicated to secondaries in time.
    rst.getSecondaries().forEach(enableFailPoint);
    doWrites(rst.getPrimary());
    // Remember the newest oplog entry on the current primary; it is looked up on the new
    // primary at the end of this case. (getLatestOp is not defined in this file; presumably
    // it comes from the rslib.js loaded above.)
    var latestOp = getLatestOp(rst.getPrimary());
    // New primary wins immediately, but needs to catch up.
    newPrimary = stepUp(rst.getSecondary());
    rst.waitForState(newPrimary, ReplSetTest.State.PRIMARY, 1000);
    // Check this node is not writable.
    assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
    // Disable fail point to allow replication.
    // This runs on every node in the set, not only the ones the failpoints were enabled on.
    rst.nodes.forEach(disableFailPoint);
    // getPrimary() blocks until the primary finishes drain mode.
    assert.eq(newPrimary, rst.getPrimary());
    // Wait for all secondaries to catch up
    rst.awaitReplication();
    // Check the latest op on old primary is preserved on the new one.
    checkOpInOplog(newPrimary, latestOp, 1);

    jsTest.log("Case 3: The primary needs to catch up, fails due to timeout.");
    rst.awaitReplication(30000, ReplSetTest.OpTimeType.LAST_DURABLE);
    // Write documents that cannot be replicated to secondaries in time.
    rst.getSecondaries().forEach(enableFailPoint);
    doWrites(rst.getPrimary());
    // Newest oplog entry on the old primary; expected to be absent from the new primary.
    latestOp = getLatestOp(rst.getPrimary());

    // New primary wins immediately, but needs to catch up.
    newPrimary = stepUp(rst.getSecondary());
    rst.waitForState(newPrimary, ReplSetTest.State.PRIMARY, 1000);
    // Newest oplog entry on the new primary at this point; used below to detect when a
    // later entry has been written.
    var latestOpOnNewPrimary = getLatestOp(newPrimary);
    // Wait until the new primary logs that it could not catch up. The failpoints are still
    // enabled on it here, so it cannot fetch the missing entries.
    assert.soon(function() {
        return countMatchedLog(newPrimary, /Cannot catch up oplog after becoming primary/) > 0;
    });
    // Only now are the failpoints on the new primary turned off.
    disableFailPoint(newPrimary);
    assert.eq(newPrimary, rst.getPrimary());

    // Wait for the no-op "new primary" after winning an election, so that we know it has
    // finished transition to primary.
    assert.soon(function() {
        return isEarlierTimestamp(latestOpOnNewPrimary.ts, getLatestOp(newPrimary).ts);
    });
    // The extra oplog entries on the old primary are not replicated to the new one.
    checkOpInOplog(newPrimary, latestOp, 0);
    // Turn the failpoints off on whichever nodes are secondaries at this point.
    rst.getSecondaries().forEach(disableFailPoint);

    jsTest.log("Case 4: The primary needs to catch up, but has to change sync source to catch up.");
    rst.awaitReplication(30000, ReplSetTest.OpTimeType.LAST_DURABLE);
    // Write documents that cannot be replicated to secondaries in time.
    rst.getSecondaries().forEach(enableFailPoint);
    doWrites(rst.getPrimary());
    // Capture the current primary and secondaries before stepping up a new primary.
    var oldPrimary = rst.getPrimary();
    var oldSecondaries = rst.getSecondaries();
    latestOp = getLatestOp(oldPrimary);
    // The first old secondary becomes the new primary; the second one serves as the
    // alternative sync source below.
    newPrimary = stepUp(oldSecondaries[0]);
    rst.waitForState(newPrimary, ReplSetTest.State.PRIMARY, 1000);
    // Disable fail point on one of the other secondaries.
    // Wait until it catches up with the old primary.
    // (awaitOpTime is not defined in this file; presumably it comes from the rslib.js
    // loaded above.)
    disableFailPoint(oldSecondaries[1]);
    awaitOpTime(oldSecondaries[1], latestOp.ts);
    // Disconnect the new primary and the old one.
    oldPrimary.disconnect(newPrimary);
    // Disable the failpoint, the new primary should sync from the other secondary.
    disableFailPoint(newPrimary);
    assert.eq(newPrimary, rst.getPrimary());
    // The old primary's latest op must appear exactly once on the new primary.
    checkOpInOplog(newPrimary, latestOp, 1);

})();