// jstests/replsets/startup_recovery_for_restore.js

/*
 * Tests that we can recover from a node with a lagged stable timestamp using the special
 * "for restore" mode, but not read from older points-in-time on the recovered node.
 *
 * This test only makes sense for storage engines that support recover to stable timestamp.
 * @tags: [requires_wiredtiger, requires_persistence, requires_journaling, requires_replication,
 * requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
 * # We don't expect to do this while upgrading.
 * multiversion_incompatible]
 */

(function() {
"use strict";
// Needed for configureFailPoint(), which is used below to pause replication on two secondaries.
load("jstests/libs/fail_point_util.js");

const dbName = TestData.testName;

// Verbosity for the storage recovery log component, passed to every (re)start of a node.
const logLevel = tojson({storage: {recovery: 2}});

// Four-node set with chaining disabled. Nodes 2 and 3 have priority 0; node 1 is the node that is
// restored below and stepped up at the end of the test.
const rst = new ReplSetTest({
    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
    settings: {chainingAllowed: false}
});

// Server parameters shared by the initial start and by every restart of the restore node.
// NOTE(review): replBatchLimitOperations is presumably lowered so the inserts below are applied in
// several oplog batches - confirm.
const startParams = {
    logComponentVerbosity: logLevel,
    replBatchLimitOperations: 100
};
const nodes = rst.startSet({setParameter: startParams});
let restoreNode = nodes[1];
rst.initiateWithHighElectionTimeout();
const primary = rst.getPrimary();
const db = primary.getDB(dbName);
const collName = "testcoll";
const sentinelCollName = "sentinelcoll";
const coll = db[collName];
const paddingStr = "XXXXXXXXX";

// The default WC is majority and stopServerReplication will prevent satisfying any majority writes.
assert.commandWorked(primary.adminCommand(
    {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));

// Pre-load some documents. The write result is asserted so that a failed insert is reported here
// rather than as a document-count mismatch much later in the test.
const nPreDocs = 2;
assert.commandWorked(coll.insert([{_id: "pre1"}, {_id: "pre2"}]));
rst.awaitReplication();

// Operation time observed after the pre-loaded documents have replicated; the restore node's
// stable timestamp is pinned to this value below.
const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
assert.commandWorked(restoreNode.adminCommand({
    configureFailPoint: 'holdStableTimestampAtSpecificTimestamp',
    mode: 'alwaysOn',
    data: {"timestamp": holdOpTime}
}));

// Insert a bunch of documents.
let bulk = coll.initializeUnorderedBulkOp();
const nDocs = 1000;
jsTestLog("Inserting " + nDocs + " documents with snapshotting disabled on one node.");
for (let id = 1; id <= nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
bulk.execute();
rst.awaitReplication();

// Pause nodes 2 and 3. With two of the four nodes no longer replicating, the writes that follow
// cannot become majority committed.
jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
const stopReplProducer2 = configureFailPoint(nodes[2], 'stopReplProducer');
const stopReplProducer3 = configureFailPoint(nodes[3], 'stopReplProducer');

// First sentinel: written after the majority point was held back. Used later as an
// atClusterTime that the restored node is expected to be unable to serve.
jsTestLog("Writing first sentinel document.");
const sentinel1Timestamp =
    assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s1"}]}))
        .operationTime;

const nExtraDocs = 50;
jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
bulk = coll.initializeUnorderedBulkOp();
for (let id = 1; id <= nExtraDocs; id++) {
    bulk.insert({_id: (id + nDocs), paddingStr: paddingStr});
}
bulk.execute();
// _id of the last document inserted into the test collection.
const lastId = nDocs + nExtraDocs;

// Operation time taken after the extra documents but before the second sentinel write.
const penultimateOpTime =
    assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

// Second sentinel: the last write made before the restore node is restarted.
const sentinel2Timestamp =
    assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}))
        .operationTime;

// Wait for the restore node only; nodes 2 and 3 are paused and cannot catch up.
rst.awaitReplication(undefined, undefined, [restoreNode]);

// First restart: as a standalone (noReplSet) with the restore-specific startup parameters.
jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
restoreNode = rst.restart(restoreNode, {
    noReplSet: true,
    setParameter: Object.merge(startParams, {
        startupRecoveryForRestore: true,
        recoverFromOplogAsStandalone: true,
        takeUnstableCheckpointOnShutdown: true
    })
});
// Make sure we can read something after standalone recovery.
assert.eq(2, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());

// Second restart: back in replica set mode, with the disableSnapshotting fail point enabled from
// startup.
jsTestLog("Restarting restore node again, in repl set mode with stable checkpointing disabled");
restoreNode = rst.restart(restoreNode, {
    noReplSet: false,
    setParameter:
        Object.merge(startParams, {"failpoint.disableSnapshotting": "{'mode':'alwaysOn'}"})
});

rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node with stable checkpointing disabled");

// Test that no appliedThrough in the minValid document after recovery.
// NOTE(review): this relies on appliedThrough being stored under the 'begin' field - confirm.
const minValid = restoreNode.getCollection('local.replset.minvalid').findOne();
assert(!minValid.hasOwnProperty('begin'), tojson(minValid));

// Test that we cannot take another checkpoint via fsync before we have an up-to-date stable
// timestamp because otherwise we would end up mistakenly taking an unstable timestamp with a null
// appliedThrough.
jsTestLog("Running fsync on restore node before it takes the first stable checkpoint should fail");
assert.commandFailed(restoreNode.adminCommand({fsync: 1}));

jsTestLog("Doing more write on the primary");
assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s3"}]}));

// Make sure we can read the new write on the restore node.  Must be durable because we're about
// to crash this node with no checkpoints.
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_DURABLE, [restoreNode]);
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());

// Kill the node with signal 9 so that it gets no chance to shut down cleanly.
jsTestLog("Crashing restore node before it takes the first stable checkpoint");
rst.stop(restoreNode, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});

// Third restart: ordinary replica set mode, using only the shared start parameters.
jsTestLog("Restarting restore node again, in repl set mode");
restoreNode =
    rst.start(restoreNode, {noReplSet: false, setParameter: startParams}, /* restart */ true);

rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node");

// Make sure we can still read the latest write on the restore node.
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());

const restoreDb = restoreNode.getDB(dbName);

jsTestLog("Checking restore node untimestamped read.");
// Basic test: should see all docs with untimestamped read.
assert.eq(nPreDocs + nDocs + nExtraDocs, coll.find().itcount());
assert.eq(nPreDocs + nDocs + nExtraDocs, restoreDb[collName].find().itcount());

// For the remaining checks we step up the restored node so we can do atClusterTime reads on it.
// They are necessarily speculative because we are preventing majority optimes from advancing.

jsTestLog("Stepping up restore node");
rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});

// Should also be able to read at the final sentinel optime on restore node.
const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: sentinel2Timestamp}});
const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
jsTestLog("Checking top-of-oplog read works on restored node.");

let res = assert.commandWorked(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}));
assert.eq(1, res.cursor.firstBatch.length);
assert.docEq({_id: lastId, paddingStr: paddingStr}, res.cursor.firstBatch[0]);

// Must abort because majority is not advancing.
restoreNodeSession.abortTransaction();

// Should NOT be able to read at the first sentinel optime on the restore node.
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: sentinel1Timestamp}});
jsTestLog(
    "Checking restore node majority optime read, which should fail, because the restore node does not have that history.");
assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": {"$gte": nDocs}}}),
    ErrorCodes.SnapshotTooOld);
restoreNodeSession.abortTransaction();

// Should NOT be able to read at the penultimate optime on the restore node either.
jsTestLog(
    "Checking restore node top-of-oplog minus 1 read, which should fail, because the restore node does not have that history.");
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
    ErrorCodes.SnapshotTooOld);
restoreNodeSession.abortTransaction();

// Allow set to become current and shut down with ordinary dbHash verification.
stopReplProducer2.off();
stopReplProducer3.off();
rst.stopSet();
})();