summaryrefslogtreecommitdiff
path: root/jstests/sharding/shard_identity_rollback.js
blob: b0a3f9b891cb4ddff1686fa3f57836ed7825de60 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
 * Tests that rolling back the insertion of the shardIdentity document on a shard causes the node
 * rolling it back to shut down.
 * @tags: [requires_persistence, requires_journaling]
 */

(function() {
"use strict";

load('jstests/libs/write_concern_util.js');

// A one-shard cluster supplies the config server replica set whose connection string goes into
// the shardIdentity document; the replica set under test is started separately below so that it
// is *not* yet a member of the cluster.
var st = new ShardingTest({shards: 1});

var replTest = new ReplSetTest({nodes: 3});
var nodes = replTest.startSet({shardsvr: ''});
replTest.initiate();

var priConn = replTest.getPrimary();
var secondaries = replTest.getSecondaries();
var configConnStr = st.configRS.getURL();

// Shards start in FCV 4.0 until a config server reaches out to them. This causes storage to
// shutdown with 4.0 compatible files, requiring rollback via refetch. The command must succeed
// or the rollback-via-refetch premise of this test does not hold.
assert.commandWorked(priConn.adminCommand({setFeatureCompatibilityVersion: "4.0"}));

// Wait for the secondaries to have the latest oplog entries before stopping the fetcher to
// avoid the situation where one of the secondaries will not have an overlapping oplog with
// the other nodes once the primary is killed.
replTest.awaitSecondaryNodes();

replTest.awaitReplication();

// Pause oplog fetching on both secondaries so the upcoming write exists only on the primary.
stopServerReplication(secondaries);

jsTest.log("inserting shardIdentity document to primary that shouldn't replicate");

var shardIdentityDoc = {
    _id: 'shardIdentity',
    configsvrConnectionString: configConnStr,
    shardName: 'newShard',
    clusterId: ObjectId()
};

assert.writeOK(priConn.getDB('admin').system.version.update(
    {_id: 'shardIdentity'}, shardIdentityDoc, {upsert: true}));

// Ensure sharding state on the primary was initialized
var res = priConn.getDB('admin').runCommand({shardingState: 1});
assert(res.enabled, tojson(res));
assert.eq(shardIdentityDoc.configsvrConnectionString, res.configServer);
assert.eq(shardIdentityDoc.shardName, res.shardName);
assert.eq(shardIdentityDoc.clusterId, res.clusterId);

// Ensure sharding state on the secondaries was *not* initialized, since replication of the
// shardIdentity document was paused before it was written.
secondaries.forEach(function(secondary) {
    secondary.setSlaveOk(true);
    res = secondary.getDB('admin').runCommand({shardingState: 1});
    assert(!res.enabled, tojson(res));
});

// Ensure manually deleting the shardIdentity document is not allowed.
assert.writeErrorWithCode(priConn.getDB('admin').system.version.remove({_id: 'shardIdentity'}),
                          40070);

jsTest.log("shutting down primary");
// Shut down the primary so a secondary gets elected that definitely won't have replicated the
// shardIdentity insert, which should trigger a rollback on the original primary when it comes
// back online.
replTest.stop(priConn);

// Resume replication on the secondaries so that one of them can catch up, win the election, and
// finish becoming primary.
restartServerReplication(secondaries);

// Wait for a new healthy primary
var newPriConn = replTest.getPrimary();
assert.neq(priConn, newPriConn);
// A majority write on the new primary guarantees the old primary's shardIdentity insert is now
// behind the majority commit point and must be rolled back.
assert.writeOK(newPriConn.getDB('test').foo.insert({a: 1}, {writeConcern: {w: 'majority'}}));

// Restart the original primary so it triggers a rollback of the shardIdentity insert.
jsTest.log("Restarting original primary");
priConn = replTest.restart(priConn);

// Wait until we cannot create a connection to the former primary, which indicates that it must
// have shut itself down during the rollback.
jsTest.log("Waiting for original primary to rollback and shut down");
assert.soon(
    function() {
        try {
            var newConn = new Mongo(priConn.host);
            return false;
        } catch (x) {
            return true;
        }
    },
    function() {
        // On timeout, dump both oplogs (newest first) to aid diagnosis.
        var oldPriOplog = priConn.getDB('local').oplog.rs.find().sort({$natural: -1}).toArray();
        var newPriOplog = newPriConn.getDB('local').oplog.rs.find().sort({$natural: -1}).toArray();
        return "timed out waiting for original primary to shut down after rollback. " +
            "Old primary oplog: " + tojson(oldPriOplog) +
            "; new primary oplog: " + tojson(newPriOplog);
    },
    90000);

// Restart the original primary again.  This time, the shardIdentity document should already be
// rolled back, so there shouldn't be any rollback and the node should stay online.
jsTest.log("Restarting original primary a second time and waiting for it to successfully become " +
           "secondary");
try {
    // Join() with the crashed mongod and ignore its bad exit status.
    MongoRunner.stopMongod(priConn);
} catch (e) {
    // expected
}
priConn = replTest.restart(priConn, {shardsvr: ''});
priConn.setSlaveOk();

// Wait for the old primary to replicate the document that was written to the new primary while
// it was shut down.
assert.soonNoExcept(function() {
    return priConn.getDB('test').foo.findOne();
});

// Ensure that there's no sharding state on the restarted original primary, since the
// shardIdentity doc should have been rolled back.
res = priConn.getDB('admin').runCommand({shardingState: 1});
assert(!res.enabled, tojson(res));
assert.eq(null, priConn.getDB('admin').system.version.findOne({_id: 'shardIdentity'}));

replTest.stopSet();

st.stop();
})();