// SERVER-25071 We now require secondaries to finish clean shutdown with a completely clean state.
// WARNING: this test does not always fail deterministically. It is possible for a bug to be
// present without this test failing. In particular if the rst.stop(1) doesn't execute mid-batch,
// it isn't fully exercising the code. However, if the test fails there is definitely a bug.
//
// @tags: [requires_persistence, requires_majority_read_concern]
(function() {
"use strict";
// Skip db hash check because secondary restarted as standalone.
TestData.skipCheckDBHashes = true;
var rst = new ReplSetTest({
name: "name",
nodes: 2,
oplogSize: 500,
});
rst.startSet();
var conf = rst.getReplSetConfig();
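    // Make the secondary non-voting and unelectable so that the primary does not depend on it to
    // remain primary while replication on the secondary is paused.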
    conf.members[1].votes = 0;
    conf.members[1].priority = 0;
    printjson(conf);
    rst.initiate(conf);

    var primary = rst.getPrimary();  // Waits for PRIMARY state.
    var slave = rst.nodes[1];

    // Stop replication on the secondary.
    assert.commandWorked(
        slave.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}));

    // Prime the main collection.
    primary.getCollection("test.coll").insert({_id: -1});

    // Start a w:2 write that will block until replication is resumed.
    var waitForReplStart = startParallelShell(function() {
        printjson(assert.writeOK(
            db.getCollection('side').insert({}, {writeConcern: {w: 2, wtimeout: 30 * 60 * 1000}})));
    }, primary.host.split(':')[1]);
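    // startParallelShell returns a join function; the shell only exits once the w:2 write is
    // acknowledged, so calling waitForReplStart() below confirms the secondary has resumed
    // applying oplog entries.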

    // Insert a lot of data in increasing order to test.coll.
    var op = primary.getCollection("test.coll").initializeUnorderedBulkOp();
    for (var i = 0; i < 1000 * 1000; i++) {
        op.insert({_id: i});
    }
    assert.writeOK(op.execute());

    // Resume replication and wait for ops to start replicating, then do a clean shutdown on the
    // secondary.
    assert.commandWorked(slave.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
    waitForReplStart();
    sleep(100);  // wait a bit to increase the chances of killing mid-batch.
    rst.stop(1);

    // Restart the secondary as a standalone node.
    var options = slave.savedOptions;
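    // Reuse the secondary's dbpath: noCleanData prevents MongoRunner from wiping the existing
    // data files, so the standalone starts on exactly the data left by the clean shutdown.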
    options.noCleanData = true;
    delete options.replSet;
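
    // With WiredTiger, ask the standalone to replay oplog entries past the last stable checkpoint
    // during startup (recoverFromOplogAsStandalone) so the data files catch up to the oplog.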
    var storageEngine = jsTest.options().storageEngine || "wiredTiger";
    if (storageEngine === "wiredTiger") {
        options.setParameter = options.setParameter || {};
        options.setParameter.recoverFromOplogAsStandalone = true;
    }

    var conn = MongoRunner.runMongod(options);
    assert.neq(null, conn, "secondary failed to start");

    // Following clean shutdown of a node, the oplog must exactly match the applied operations.
    // Additionally, the begin field must not be in the minValid document, the ts must match the
    // top of the oplog (SERVER-25353), and the oplogTruncateAfterPoint must be null (SERVER-7200
    // and SERVER-25071).
    var oplogDoc = conn.getCollection('local.oplog.rs')
                       .find({ns: 'test.coll'})
                       .sort({$natural: -1})
                       .limit(1)[0];
    var collDoc = conn.getCollection('test.coll').find().sort({_id: -1}).limit(1)[0];
    var minValidDoc =
        conn.getCollection('local.replset.minvalid').find().sort({$natural: -1}).limit(1)[0];
    var oplogTruncateAfterPointDoc =
        conn.getCollection('local.replset.oplogTruncateAfterPoint').find().limit(1)[0];
    printjson({
        oplogDoc: oplogDoc,
        collDoc: collDoc,
        minValidDoc: minValidDoc,
        oplogTruncateAfterPointDoc: oplogTruncateAfterPointDoc
    });

    try {
        assert.eq(collDoc._id, oplogDoc.o._id);
        assert(!('begin' in minValidDoc), 'begin in minValidDoc');
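        // The minValid.ts check is limited to engines that do not replay the oplog at standalone
        // startup; with recoverFromOplogAsStandalone, startup recovery may maintain minValid
        // differently.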
        if (storageEngine !== "wiredTiger") {
            assert.eq(minValidDoc.ts, oplogDoc.ts);
        }
        assert.eq(oplogTruncateAfterPointDoc.oplogTruncateAfterPoint, Timestamp());
    } catch (e) {
        // TODO remove once SERVER-25777 is resolved.
        jsTest.log(
            "Look above and make sure clean shutdown finished without resorting to SIGKILL." +
            "\nUnfortunately that currently doesn't fail the test.");
        throw e;
    }

    rst.stopSet();
})();