/*
 * Tests that we can recover from a node with a lagged stable timestamp using the special
 * "for restore" mode, but not read from older points-in-time on the recovered node, and that
 * we can do so even after we crash in the middle of an attempt to restore.
 *
 * This test only makes sense for storage engines that support recover to stable timestamp.
 * @tags: [requires_wiredtiger, requires_persistence, requires_replication,
 *   requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
 *   # We don't expect to do this while upgrading.
 *   multiversion_incompatible]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");

const SIGKILL = 9;

const dbName = TestData.testName;
const logLevel = tojson({storage: {recovery: 2, wt: {wtCheckpoint: 1}}});

const rst = new ReplSetTest({
    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
    settings: {chainingAllowed: false}
});

const startParams = {
    logComponentVerbosity: logLevel,
    replBatchLimitOperations: 100
};
const nodes = rst.startSet({setParameter: startParams});
let restoreNode = nodes[1];
rst.initiateWithHighElectionTimeout();
const primary = rst.getPrimary();
const db = primary.getDB(dbName);
const collName = "testcoll";
const sentinelCollName = "sentinelcoll";
const coll = db[collName];
const sentinelColl = db[sentinelCollName];
const paddingStr = "XXXXXXXXX";

// The default WC is majority and this test can't satisfy majority writes.
assert.commandWorked(primary.adminCommand(
    {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));

// Pre-load some documents.
const nDocs = 100;
let bulk = coll.initializeUnorderedBulkOp();
for (let id = 1; id <= nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
bulk.execute();
rst.awaitReplication();

const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
assert.commandWorked(restoreNode.adminCommand({
    configureFailPoint: 'holdStableTimestampAtSpecificTimestamp',
    mode: 'alwaysOn',
    data: {"timestamp": holdOpTime}
}));
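
// Explanatory note: replBatchLimitOperations (set to 100 above) caps the number of operations in
// an oplog application batch, so the updates below replicate in batches of at most 100 entries.
// Issuing the sentinel inserts after update 650 should therefore land them in the middle of one
// of those batches, which is what later lets the hangAfterCollectionInserts failpoint pause the
// restore node while a recovery batch is only partially applied.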
// Do a bunch of updates, which are chosen so if one is missed, we'll know.
bulk = coll.initializeUnorderedBulkOp();
const nUpdates = 1000;
const writeSentinelsAfter = 650;  // This should be set to get us to the middle of a batch.
jsTestLog("Making " + nUpdates + " updates with snapshotting disabled on one node.");
for (let updateNo = 1; updateNo <= nUpdates; updateNo++) {
    let id = (updateNo - 1) % nDocs + 1;
    let updateField = "u" + updateNo;
    let setDoc = {};
    setDoc[updateField] = updateNo;
    bulk.find({_id: id}).updateOne({"$set": setDoc});
    if (updateNo == writeSentinelsAfter) {
        // Write a bunch of inserts to the sentinel collection, which will be used to hang
        // oplog application mid-batch.
        bulk.execute();
        bulk = sentinelColl.initializeUnorderedBulkOp();
        for (let j = 1; j <= 100; j++) {
            bulk.insert({_id: j});
        }
        bulk.execute();
        bulk = coll.initializeUnorderedBulkOp();
    }
}
bulk.execute();
rst.awaitReplication();

jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
let stopReplProducer2 = configureFailPoint(nodes[2], 'stopReplProducer');
let stopReplProducer3 = configureFailPoint(nodes[3], 'stopReplProducer');

const nExtraDocs = 50;
jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
bulk = coll.initializeUnorderedBulkOp();
const lastId = nDocs + nExtraDocs;
for (let id = 1; id <= nExtraDocs; id++) {
    bulk.insert({_id: (id + nDocs), paddingStr: paddingStr});
}
bulk.execute();
const penultimateOpTime =
    assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

const sentinel2Timestamp =
    assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}))
        .operationTime;

rst.awaitReplication(undefined, undefined, [restoreNode]);
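
// Explanatory note on the restart parameters below: startupRecoveryForRestore requests the
// special "for restore" recovery mode described in the header comment;
// recoverFromOplogAsStandalone replays the oplog during startup with the node running as a
// standalone; and takeUnstableCheckpointOnShutdown, together with syncdelay: 1, makes the node
// checkpoint frequently so an unstable checkpoint is on disk when we later kill it mid-recovery.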
jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
// Must use stop/start with waitForConnect: false. See SERVER-56446.
rst.stop(restoreNode, undefined, undefined, {forRestart: true});
clearRawMongoProgramOutput();
rst.start(restoreNode,
          {
              noReplSet: true,
              waitForConnect: false,
              syncdelay: 1,  // Take a lot of unstable checkpoints.
              setParameter: Object.merge(startParams, {
                  startupRecoveryForRestore: true,
                  recoverFromOplogAsStandalone: true,
                  takeUnstableCheckpointOnShutdown: true,
                  'failpoint.hangAfterCollectionInserts':
                      tojson({mode: 'alwaysOn', data: {collectionNS: sentinelColl.getFullName()}}),
              })
          },
          true /* restart */);

assert.soon(() => {
    // Can't use checkLog because we can't connect to the mongod in startup mode.
    return rawMongoProgramOutput().search("hangAfterCollectionInserts fail point enabled") !== -1;
});

// We need to make sure we get a checkpoint after the failpoint is hit, so we clear the output
// after hitting it. Occasionally we'll miss a checkpoint as a result of clearing the output, but
// we'll get another one a second later.
clearRawMongoProgramOutput();

// Ensure the checkpoint starts after the insert.
assert.soon(() => {
    return rawMongoProgramOutput().search("WT_VERB_CHECKPOINT.*saving checkpoint snapshot min") !==
        -1;
});

// Ensure that we wait for a checkpoint completed log message that comes strictly after the above
// checkpoint started message.
clearRawMongoProgramOutput();
assert.soon(() => {
    return rawMongoProgramOutput().search("Completed unstable checkpoint.") !== -1;
});

jsTestLog("Restarting restore node uncleanly");
rst.stop(restoreNode, SIGKILL, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
restoreNode = rst.start(restoreNode,
                        {
                            noReplSet: true,
                            setParameter: Object.merge(startParams, {
                                startupRecoveryForRestore: true,
                                recoverFromOplogAsStandalone: true,
                                takeUnstableCheckpointOnShutdown: true
                            })
                        },
                        true /* restart */);

// Make sure we can read something after standalone recovery.
assert.eq(1, restoreNode.getDB(dbName)[sentinelCollName].find({}).limit(1).itcount());

jsTestLog("Restarting restore node again, in repl set mode");
restoreNode = rst.restart(restoreNode, {noReplSet: false, setParameter: startParams});
rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node");

// For the timestamp check we step up the restored node so we can do atClusterTime reads on it.
// They are necessarily speculative because we are preventing majority optimes from advancing.
jsTestLog("Stepping up restore node");
rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});

// Should NOT be able to read at the penultimate optime on the restore node.
const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
jsTestLog("Checking restore node top-of-oplog minus 1 read, which should fail, because the " +
          "restore node does not have that history.");
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
    ErrorCodes.SnapshotTooOld);
restoreNodeSession.abortTransaction();

// Should see that very last document.
assert.docEq({_id: "s2"}, restoreNode.getDB(dbName)[sentinelCollName].findOne({_id: "s2"}));

// Allow the set to become current and shut down with ordinary dbHash verification.
stopReplProducer2.off();
stopReplProducer3.off();
rst.stopSet();
})();